diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index afcf5adcda1..d8dc6a7580b 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -50,8 +50,8 @@
/tests/unittest/_torch/compilation @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/multi_gpu/test_ar_residual_norm.py @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/multi_gpu/test_user_buffers.py @NVIDIA/trt-llm-torch-graph-compiler
-/tests/unittest/_torch/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler
-/tests/unittest/_torch/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/thop/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/misc/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler
## TensorRT-LLM Pytorch - Attention
/tensorrt_llm/_torch/attention_backend @NVIDIA/trt-llm-torch-attention-devs
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index ec3f13abf45..9857e15e104 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -40,7 +40,7 @@ jobs:
startsWith(github.event.comment.body, '/bot skip --comment') ||
startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
startsWith(github.event.comment.body, '/bot kill')) && contains(
- fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025"]'),
+ fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang"]'),
github.actor)
steps:
- name: Check if comment is issued by authorized person
diff --git a/README.md b/README.md
index f6625a05596..1559ee4d00c 100644
--- a/README.md
+++ b/README.md
@@ -18,10 +18,9 @@ TensorRT-LLM
## Tech Blogs
-* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
+* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM
✨ [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)
-
* [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
@@ -44,6 +43,7 @@ TensorRT-LLM
✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)
## Latest News
+* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b)
* [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)
* [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai)
* [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick
diff --git a/cpp/include/tensorrt_llm/common/logger.h b/cpp/include/tensorrt_llm/common/logger.h
index df84e226389..c8164b10e55 100644
--- a/cpp/include/tensorrt_llm/common/logger.h
+++ b/cpp/include/tensorrt_llm/common/logger.h
@@ -54,20 +54,21 @@ class Logger
#if defined(_MSC_VER)
    template <typename... Args>
- void log(Level level, char const* format, Args const&... args);
+ void log(Level const level, char const* format, Args const&... args);
    template <typename... Args>
- void log(Level level, int rank, char const* format, Args const&... args);
+ void log(Level const level, int const rank, char const* format, Args const&... args);
#else
    template <typename... Args>
- void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
+ void log(Level const level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
    template <typename... Args>
- void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0)));
+ void log(Level const level, int const rank, char const* format, Args const&... args)
+ __attribute__((format(printf, 4, 0)));
#endif
    template <typename... Args>
- void log(Level level, std::string const& format, Args const&... args)
+ void log(Level const level, std::string const& format, Args const&... args)
{
return log(level, format.c_str(), args...);
}
@@ -134,7 +135,7 @@ class Logger
};
template <typename... Args>
-void Logger::log(Logger::Level level, char const* format, Args const&... args)
+void Logger::log(Logger::Level const level, char const* format, Args const&... args)
{
if (isEnabled(level))
{
diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py
index bd743d829a0..24e12f54634 100644
--- a/cpp/kernels/fmha_v2/fmha_test.py
+++ b/cpp/kernels/fmha_v2/fmha_test.py
@@ -165,8 +165,8 @@ def test_trtllm_context_mla_attention_fmha(dtype, s):
if dtype == "-bf16" and s == 4096:
epsilon += ' -epsilon 0.03'
- if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version != 120:
- pytest.skip("FP8 MLAs are only supported on sm120 currently.")
+ if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version not in [90, 120]:
+ pytest.skip("FP8 MLAs are only supported on sm90 and sm120 currently.")
# Context phase kernels, always use separate-q-k-v layout.
subprocess.run(
diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py
index 220b7898a98..2d8a6b416a6 100644
--- a/cpp/kernels/fmha_v2/setup.py
+++ b/cpp/kernels/fmha_v2/setup.py
@@ -1914,8 +1914,9 @@ def enable_mutex(kspec):
def enable_tma_store(kspec):
+ output_dtype = kspec.output_dtype if kspec.output_dtype is not None else kspec.dtype
# TMA copies data in the 16B granularity.
- return 'true' if (kspec.dtype in ['e4m3', 'e4m3_fp32']
+ return 'true' if (output_dtype in ['e4m3', 'e4m3_fp32']
and kspec.head_size % 16 == 0) else 'false'
@@ -3812,7 +3813,9 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
# use specialized kernels for cases without alibi scales.
# there is a numeric issues when applying the exp2f scale optimization and alibi scale at the same time.
combinations = product([False, True], \
- [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV, InputLayout.Q_PAGED_KV], [False, True])
+ [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV,
+ InputLayout.Q_PAGED_KV, InputLayout.SEPARATE_Q_K_V],
+ [False, True])
for (alibi, input_layout, enable_attn_logit_softcapping) in combinations:
# alibi and bmm1_tanh_scale shouldn't be used together.
if alibi and enable_attn_logit_softcapping:
@@ -3911,7 +3914,7 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
has_noloop=0,
noloop_step=64,
kv_loop_step=
- 128, # use 64 kv step size to avoid register spilling
+ 128, # use 128 kv step size to avoid register spilling
kv_tile_buffers=2, # only used by warp specialized kernels
unroll_threshold=1,
has_scale_max=False,
@@ -3926,6 +3929,46 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
sage_block_sizes=sage_block_sizes,
output_dtype=output_dtype))
+ # context MLA (192x128)
+ # we could use param 'output_dtype' of enumerate_qgmma_flash_warpspec_kernels(),
+ # but it will generate many unnecessary kernels and they are not easy to filter out.
+ for output_type in [None, 'bf16']:
+ specs.append(
+ kernel_spec(
+ sm=sm,
+ sm_mma=90,
+ dtype=dtype,
+ seq_len=0, # support any sequence length
+ head_size=192,
+ head_size_v=128,
+ warps_m=4, #4x1 warpgroups
+ warps_n=1,
+ version=2,
+ interleaved=False,
+ ldgsts_q=
+ False, # for Hopper kernels, ldgsts = False signals TMA usage.
+ ldgsts_k=False,
+ ldgsts_v=False,
+ share_smem_k_v=False,
+ loop_step=64,
+ q_tile_buffers=1, # only used by warp specialized kernels
+ has_noloop=0,
+ noloop_step=64,
+ kv_loop_step=128,
+ kv_tile_buffers=2, # only used by warp specialized kernels
+ unroll_threshold=1,
+ has_scale_max=False,
+ flash_attention=True,
+ warp_specialization=True,
+ alibi=alibi,
+ enable_attn_logit_softcapping=enable_attn_logit_softcapping,
+ return_softmax_stats=
+ False, # return softmax stats is not supported for fp8 now
+ scheduling_mode=scheduling_mode,
+ input_layout=input_layout,
+ sage_block_sizes=sage_block_sizes,
+ output_dtype=output_type))
+
def enumerate_igmma_kernels(specs, sm=90):
specs.append(
@@ -6377,7 +6420,7 @@ def enumerate_kernels():
and kspec.tiled == True)
# Deepseek MLA (context 192/128 separate-q-k-v)
or (kspec.sm in [90, 100, 120]
- and kspec.dtype in ['bf16', 'e4m3_fp32']
+ and kspec.dtype in ['bf16', 'e4m3', 'e4m3_fp32']
and kspec.head_size == 192
and kspec.head_size_v == 128
and kspec.input_layout == InputLayout.SEPARATE_Q_K_V
diff --git a/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h b/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h
index 75946bac612..e186ab8ad26 100644
--- a/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h
+++ b/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h
@@ -1222,6 +1222,14 @@ struct Gmem_tile_o_qgmma_fp32_16bits
inline __device__ Gmem_tile_o_qgmma_fp32_16bits(
Params const& params, Block_info const& block_info, Shared&&, int tidx, int cta_row_offset = 0)
: params_o_stride_in_bytes_(params.o_stride_in_bytes)
+ , params_scale_bmm2_(
+#ifdef GENERATE_CUBIN
+ // Specialized for trt-llm generated cubins only.
+ params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2
+#else
+ params.scale_bmm2
+#endif
+ )
, actual_seqlen_(block_info.actual_seqlen)
        , o_ptr_(reinterpret_cast<char*>(params.o_ptr))
{
@@ -1251,21 +1259,24 @@ struct Gmem_tile_o_qgmma_fp32_16bits
inline __device__ void store(Accumulators const (&acc)[M][N])
{
int64_t const step_m = 8 * params_o_stride_in_bytes_;
- // we assume M = 1. some shortcuts.
- static_assert(M == 1);
-
-#define STORE_COLUMN(idx) \
- { \
- float _reg0 = acc[0][mma_ni].elt(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + idx); \
- float _reg1 = acc[0][mma_ni].elt(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + idx); \
- static_assert(std::is_same_v || std::is_same_v); \
- uint32_t _out = fmha::float2_to_16bit_2(_reg0, _reg1); \
- int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \
- fmha::stg(o_ptr_ + _offset + 4 * idx, _out); \
- }
+#ifdef UNIFIED_EPILOGUE_SCALE
+ constexpr bool Scale = false;
+#else
+ constexpr bool Scale = true;
+#endif
#define STORE_COLUMNS() \
{ \
- STORE_COLUMN(0) STORE_COLUMN(1) \
+ /* we assume M = 1. some shortcuts. */ \
+ static_assert(M == 1); \
+ uint4 _src = { \
+ .x = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2), \
+ .y = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2), \
+ .z = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1), \
+ .w = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1), \
+ }; \
+ uint2 _dst = Acc_packer::run(this, _src); \
+ int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \
+ fmha::stg(o_ptr_ + _offset, _dst); \
}
#pragma unroll
@@ -1303,6 +1314,8 @@ struct Gmem_tile_o_qgmma_fp32_16bits
// The stride between rows for the QKV matrice.
int64_t params_o_stride_in_bytes_;
+ // Scaling factor; this usually means QKV descale factor in actuality
+ uint32_t params_scale_bmm2_;
// The pointer.
char* o_ptr_;
// The row loaded by this thread.
diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h
index c8e3c318d6f..1a8853a03ec 100644
--- a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h
+++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h
@@ -755,7 +755,7 @@ struct DMA
for (int kgroup_idx = 0; kgroup_idx < Kernel_traits::BMM2_K_GROUPS; kgroup_idx++)
{
#pragma unroll
- for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::D_GROUPS; dgroup_idx++)
+ for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::DV_GROUPS; dgroup_idx++)
{
// Src smem block is k first then d
uint32_t src_offset = (kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP
@@ -764,7 +764,7 @@ struct DMA
// Dst smem block is d first then k
uint32_t dst_offset = (dgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP
- + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D)
+ + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::DV)
* Kernel_traits::ELEMENT_BYTES;
transposer.template transpose_(smem_v_src + src_offset, smem_v_dst + dst_offset);
diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h
index 8c93ce8a988..e12847d7659 100644
--- a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h
+++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h
@@ -589,7 +589,8 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32
// Base class.
using Base = Kernel_traits;
+ SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_, ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_,
+ OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>;
enum
{
diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu
index 9a438df9a2a..da44fba60c4 100644
--- a/cpp/kernels/xqa/mha_sm90.cu
+++ b/cpp/kernels/xqa/mha_sm90.cu
@@ -1012,7 +1012,7 @@ CUBIN_EXPORT __global__
if (threadIdx.x < smem.gemm1AccColMax.size)
{
auto const idx = threadIdx.x;
- smem.gemm1AccColMax[idx] = mha::numeric_limits::lowest();
+ smem.gemm1AccColMax[idx] = safeInitRowMax;
smem.gemm1AccColSum[idx] = 0;
}
smem.gemm1WarpGrpBar.arrive_and_wait();
@@ -1949,7 +1949,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
uint32_t const globalRow = tileStartRow + row;
if (globalRow >= cacheSeqLen)
{
- acc(m, n)(i, j) = mha::numeric_limits::lowest();
+ acc(m, n)(i, j) = safeInitRowMax;
continue;
}
if (globalRow >= maskStartRow)
@@ -1957,7 +1957,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
uint32_t const maskRow = globalRow - maskStartRow;
if ((bit_mask >> maskRow) == 0)
{
- acc(m, n)(i, j) = mha::numeric_limits::lowest();
+ acc(m, n)(i, j) = safeInitRowMax;
}
}
}
@@ -2087,7 +2087,7 @@ __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32
#pragma unroll
for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++)
{
- acc(m, n)(i, j) = mha::numeric_limits::lowest();
+ acc(m, n)(i, j) = safeInitRowMax;
}
}
}
@@ -2380,9 +2380,9 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
{
uint32_t const col = GmmaAccCoreMat::cols * (4 * n + idxInQuad) + j;
assert((col < nbValidCols) == bool(endMask & (1ULL << col)));
- if (((mask >> col) & 1) == 0)
+ if ((mask & (1ULL << col)) == 0)
{
- acc(m, n)(i, j) = mha::numeric_limits::lowest();
+ acc(m, n)(i, j) = safeInitRowMax;
}
}
}
@@ -2410,7 +2410,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uin
#pragma unroll
for (uint32_t i = 0; i < GmmaAccCoreMat::rows; i++)
{
- acc(m, n)(i, j) = mha::numeric_limits::lowest();
+ acc(m, n)(i, j) = safeInitRowMax;
}
}
}
diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp
index bbe2e2fd179..866a0d71c8f 100644
--- a/cpp/tensorrt_llm/common/attentionOp.cpp
+++ b/cpp/tensorrt_llm/common/attentionOp.cpp
@@ -285,6 +285,9 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale;
xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf;
+ // Cross attention parameters.
+ xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
+
return true;
}
@@ -2229,6 +2232,10 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud
{
TLLM_CHECK_WITH_INFO(false, "No available kernels are found for FP4 output.");
}
+ else
+ {
+ TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase.");
+ }
}
// This is the number of kv tokens that q needs to visit, but excluding one as it will be processed before the kv
@@ -2750,7 +2757,7 @@ int AttentionOp::initialize() noexcept
!useCustomMask() || mEnableContextFMHA, "Only Context FMHA supports custom mask input currently.");
}
- mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled) && !mCrossAttention
+ mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled)
&& (mType == nvinfer1::DataType::kHALF || mType == nvinfer1::DataType::kBF16) && mUseKVCache;
if (mEnableXQA)
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
index 2332950629f..0ce601d5b08 100644
--- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
+++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
@@ -1524,6 +1524,11 @@ struct CollectiveMmaArrayMixedInput<
CUTLASS_DEVICE void tensormaps_cp_fence_release(
TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps)
{
+ if (cute::elect_one_sync())
+ {
+ cute::tma_desc_commit_group();
+ cute::tma_desc_wait_group();
+ }
// Entire warp must do this (i.e. it's aligned)
tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);
diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
index fbcd279420d..0c2f3aed72b 100644
--- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
+++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
@@ -260,6 +260,8 @@ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90
extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
+extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
+extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
@@ -1969,6 +1971,10 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr},
{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90},
{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90},
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90},
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90},
+{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90},
+{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90},
{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90},
{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90},
{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90},
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h
index 97ad58335fc..fcf8ab3851f 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h
@@ -106,6 +106,9 @@ struct XQAParams
void* quant_q_buffer_ptr = nullptr;
+ // for cross attention
+ int32_t const* encoder_input_lengths = nullptr;
+
cudaStream_t stream = 0;
std::string toString() const
@@ -175,6 +178,7 @@ struct XQAParams
<< "total_num_input_tokens :" << total_num_input_tokens << std ::endl
<< "is_fp8_output :" << (is_fp8_output ? "true" : "false") << std ::endl
<< "fp8_out_scale :" << fp8_out_scale << std ::endl
+ << "encoder_input_lengths: " << encoder_input_lengths << std::endl
<< "stream :" << stream;
return ss.str();
diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h
index 65e55c65fba..4442e2c2369 100644
--- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h
+++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h
@@ -1348,15 +1349,17 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams
-        auto const src_k_idx
-            = static_cast(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv;
- auto const src_v_idx
- = static_cast(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv;
-
- // Only load K,V tokens from encoder qkv input.
-        auto k = *reinterpret_cast(&params.cross_kv_input[src_k_idx]);
-        auto v = *reinterpret_cast(&params.cross_kv_input[src_v_idx]);
-
- // The kv cache pointers.
- auto k_cache_block_ptr
- = reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx));
- auto v_cache_block_ptr
- = reinterpret_cast(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx));
- // The vector idx in the cache block.
- auto block_vec_idx
- = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx);
-
- // Store K and V to the cache.
- // INT8/FP8 kv cache.
- if constexpr (sizeof(TCache) == 1)
- {
- // The element index inside the block.
- auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
- // Store 8bits kv cache.
- mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
- mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
- }
- else
+ // Encoder tokens (i.e. KV tokens).
+ if (head_idx == (kv_head_idx * params.qheads_per_kv_head) && token_idx < encoder_seq_len
+ && store_encoder_kv_cache && params.kv_cache_buffer.data != nullptr)
{
- reinterpret_cast(k_cache_block_ptr)[block_vec_idx] = k;
- reinterpret_cast(v_cache_block_ptr)[block_vec_idx] = v;
+ // The global token idx in all sequences.
+ int global_token_idx = token_idx + encoder_seq_offset;
+
+ // The memory offset.
+ auto const src_k_idx
+ = static_cast(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv;
+ auto const src_v_idx
+ = static_cast(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv;
+
+ // Only load K,V tokens from encoder qkv input.
+            auto k = *reinterpret_cast(&params.cross_kv_input[src_k_idx]);
+            auto v = *reinterpret_cast(&params.cross_kv_input[src_v_idx]);
+
+ // The kv cache pointers.
+ auto k_cache_block_ptr
+ = reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx));
+ auto v_cache_block_ptr
+ = reinterpret_cast(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx));
+ // The vector idx in the cache block.
+ auto block_vec_idx
+ = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx);
+
+ // Store K and V to the cache.
+ // INT8/FP8 kv cache.
+ if constexpr (sizeof(TCache) == 1)
+ {
+ // The element index inside the block.
+ auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
+ // Store 8bits kv cache.
+ mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
+ mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
+ }
+ else
+ {
+ reinterpret_cast(k_cache_block_ptr)[block_vec_idx] = k;
+ reinterpret_cast(v_cache_block_ptr)[block_vec_idx] = v;
+ }
}
}
}
diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp
index 34fe1780fe0..956d1919ced 100644
--- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp
+++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp
@@ -16,7 +16,9 @@
#include "xqaDispatcher.h"
#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h"
#include "tensorrt_llm/kernels/unfusedAttentionKernels.h"
+#include
namespace
{
@@ -38,6 +40,87 @@ constexpr inline T roundUp(T a, T b)
namespace tensorrt_llm::kernels
{
+namespace
+{
+
+template
+QKVPreprocessingParams makeQKVPreprocessingParams(XQAParams const& params,
+ XQALaunchParam const& launchParams, void* xqa_q_input_ptr, Data_type QDataType,
+ KvCacheDataType cache_type, int32_t batch_beam_size, KVCacheBuffer const& kv_cache_buffer,
+ int32_t const* cu_seqlens, int32_t const* cu_kv_seqlens, float const* rotary_inv_freq_buf, int multiProcessorCount)
+{
+ QKVPreprocessingParams preprocessingParms;
+ memset(&preprocessingParms, 0, sizeof(preprocessingParms));
+ // Set parameters.
+ preprocessingParms.qkv_input = static_cast(const_cast(params.qkv));
+ preprocessingParms.q_output = static_cast(xqa_q_input_ptr);
+ preprocessingParms.kv_cache_buffer = kv_cache_buffer;
+ preprocessingParms.kv_cache_block_scales_buffer = {};
+ preprocessingParms.qkv_bias = static_cast(params.qkv_bias);
+ // Prepare values for fmha.
+ preprocessingParms.fmha_bmm1_scale = launchParams.bmm1_scale_ptr;
+ preprocessingParms.fmha_bmm2_scale = launchParams.bmm2_scale_ptr;
+ bool const is_fp8_q_input = (QDataType == DATA_TYPE_E4M3);
+ if (params.kv_cache_quant_mode.hasFp8KvCache())
+ {
+ preprocessingParms.q_scale_quant_orig = params.kv_scale_quant_orig;
+ preprocessingParms.kv_scale_quant_orig = params.kv_scale_quant_orig;
+ }
+ if (params.is_fp8_output)
+ {
+ preprocessingParms.o_scale_orig_quant = params.fp8_out_scale;
+ }
+ // Buffers.
+ preprocessingParms.logn_scaling = params.logn_scaling_ptr;
+ preprocessingParms.seq_lens = params.spec_decoding_generation_lengths;
+ preprocessingParms.cache_seq_lens = params.sequence_lengths;
+ preprocessingParms.cu_seq_lens = cu_seqlens;
+ preprocessingParms.rotary_embedding_inv_freq = rotary_inv_freq_buf;
+ preprocessingParms.rotary_coef_cache_buffer = params.rotary_cos_sin;
+ preprocessingParms.kvScaleOrigQuant = params.kv_scale_orig_quant;
+ preprocessingParms.kv_cache_scale_factors = nullptr;
+ preprocessingParms.spec_decoding_position_offsets
+ = params.cross_attention ? nullptr : params.spec_decoding_position_offsets;
+ preprocessingParms.mrope_position_deltas = params.mrope_position_deltas;
+ // Scalar parameters.
+ preprocessingParms.batch_size = int(batch_beam_size);
+ preprocessingParms.max_input_seq_len = params.generation_input_length;
+ preprocessingParms.max_kv_seq_len = params.max_past_kv_length;
+ preprocessingParms.cyclic_kv_cache_len
+ = params.cross_attention ? params.max_past_kv_length : params.cyclic_attention_window_size;
+ preprocessingParms.sink_token_len = params.cross_attention ? 0 : params.sink_token_length;
+ preprocessingParms.token_num = params.total_num_input_tokens;
+ preprocessingParms.remove_padding = true;
+ preprocessingParms.cross_attention = params.cross_attention;
+ preprocessingParms.head_num = params.num_q_heads;
+ preprocessingParms.kv_head_num = params.num_kv_heads;
+ preprocessingParms.qheads_per_kv_head = params.num_q_heads / params.num_kv_heads;
+ preprocessingParms.size_per_head = params.head_size;
+ preprocessingParms.fmha_host_bmm1_scale = 1.0f / (sqrtf(params.head_size * 1.0f) * params.q_scaling);
+ preprocessingParms.rotary_embedding_dim = params.rotary_embedding_dim;
+ preprocessingParms.rotary_embedding_base = params.rotary_embedding_base;
+ preprocessingParms.rotary_scale_type = params.rotary_embedding_scale_type;
+ preprocessingParms.rotary_embedding_scale = params.rotary_embedding_scale;
+ preprocessingParms.rotary_embedding_max_positions = params.rotary_embedding_max_positions;
+ preprocessingParms.position_embedding_type = params.position_embedding_type;
+ preprocessingParms.position_shift_enabled = params.position_shift_enabled;
+ preprocessingParms.cache_type = cache_type;
+ preprocessingParms.separate_q_kv_output = true;
+ preprocessingParms.quantized_fp8_output = is_fp8_q_input;
+ preprocessingParms.generation_phase = true;
+ preprocessingParms.multi_processor_count = multiProcessorCount;
+ preprocessingParms.rotary_vision_start = params.rotary_vision_start;
+ preprocessingParms.rotary_vision_length = params.rotary_vision_length;
+
+ // Cross-attention only.
+
+ preprocessingParms.encoder_seq_lens = params.encoder_input_lengths;
+
+ return preprocessingParms;
+}
+
+} // namespace
+
////////////////////////////////////////////////////////////////////////////////////////////////////
XqaDispatcher::XqaDispatcher(XqaFixedParams fixedParams)
@@ -137,9 +220,10 @@ bool XqaDispatcher::shouldUse(XQAParams const& params)
{
SHOULD_NOT_USE("Fallback to MMHA as unidirectional is not supported by TRTLLM-GEN kernels.");
}
- if (params.cross_attention)
+ if (params.cross_attention && !params.paged_kv_cache)
{
- SHOULD_NOT_USE("Fallback to MMHA as cross attention is not supported by TRTLLM-GEN kernels.");
+ SHOULD_NOT_USE(
+ "Fallback to MMHA as cross attention without paged KV Cache is not supported by TRTLLM-GEN kernels.");
}
if (params.paged_kv_cache && params.tokens_per_block < 8)
{
@@ -252,8 +336,8 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff
decoder_params.seqQOffsets = launchParams.cu_seq_lens;
decoder_params.seqKVOffsets = launchParams.cu_kv_seq_lens;
decoder_params.seqQLengths = params.spec_decoding_generation_lengths;
- decoder_params.seqKVLengths = params.sequence_lengths;
- decoder_params.batchSize = int(batch_beam_size);
+ decoder_params.seqKVLengths = params.cross_attention ? params.encoder_input_lengths : params.sequence_lengths;
+ decoder_params.batchSize = static_cast(batch_beam_size);
decoder_params.maxQSeqLength = params.generation_input_length;
decoder_params.numTokens = params.total_num_input_tokens;
decoder_params.removePadding = true;
@@ -273,10 +357,12 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff
float const* rotary_inv_freq_buf = params.rotary_embedding_inv_freq_cache;
// Use the nullptr for cu_seqlens when it is not computed.
int const* cu_seqlens{nullptr};
+ int const* cu_kv_seqlens{nullptr};
if (decoder_params.isBuildDecoderInfoKernelNeeded())
{
rotary_inv_freq_buf = launchParams.rotary_inv_freq_buf;
cu_seqlens = launchParams.cu_seq_lens;
+ cu_kv_seqlens = launchParams.cu_kv_seq_lens;
invokeBuildDecoderInfo(decoder_params, params.stream);
sync_check_cuda_error(params.stream);
}
@@ -285,66 +371,10 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff
// NOTE: MHA kernels should read kv cache that has already been appended with new tokens' kv cache.
void* xqa_q_input_ptr = inputScratch;
// The preprocessing kernel that applies RoPE and updates kv cache.
- QKVPreprocessingParams preprocessingParms;
- memset(&preprocessingParms, 0, sizeof(preprocessingParms));
- // Set parameters.
- preprocessingParms.qkv_input = static_cast(const_cast(params.qkv));
- preprocessingParms.q_output = static_cast(xqa_q_input_ptr);
- preprocessingParms.kv_cache_buffer = kv_cache_buffer;
- preprocessingParms.kv_cache_block_scales_buffer = {};
- preprocessingParms.qkv_bias = static_cast(params.qkv_bias);
- // Prepare values for fmha.
- preprocessingParms.fmha_bmm1_scale = launchParams.bmm1_scale_ptr;
- preprocessingParms.fmha_bmm2_scale = launchParams.bmm2_scale_ptr;
- bool const is_fp8_q_input = (mQDataType == DATA_TYPE_E4M3);
- if (params.kv_cache_quant_mode.hasFp8KvCache())
- {
- preprocessingParms.q_scale_quant_orig = params.kv_scale_quant_orig;
- preprocessingParms.kv_scale_quant_orig = params.kv_scale_quant_orig;
- }
- if (params.is_fp8_output)
- {
- preprocessingParms.o_scale_orig_quant = params.fp8_out_scale;
- }
- // Buffers.
- preprocessingParms.logn_scaling = params.logn_scaling_ptr;
- preprocessingParms.seq_lens = params.spec_decoding_generation_lengths;
- preprocessingParms.cache_seq_lens = params.sequence_lengths;
- preprocessingParms.cu_seq_lens = cu_seqlens;
- preprocessingParms.rotary_embedding_inv_freq = rotary_inv_freq_buf;
- preprocessingParms.rotary_coef_cache_buffer = params.rotary_cos_sin;
- preprocessingParms.kvScaleOrigQuant = params.kv_scale_orig_quant;
- preprocessingParms.kv_cache_scale_factors = nullptr;
- preprocessingParms.spec_decoding_position_offsets = params.spec_decoding_position_offsets;
- preprocessingParms.mrope_position_deltas = params.mrope_position_deltas;
- // Scalar parameters.
- preprocessingParms.batch_size = int(batch_beam_size);
- preprocessingParms.max_input_seq_len = params.generation_input_length;
- preprocessingParms.max_kv_seq_len = params.max_past_kv_length;
- preprocessingParms.cyclic_kv_cache_len = params.cyclic_attention_window_size;
- preprocessingParms.sink_token_len = params.sink_token_length;
- preprocessingParms.token_num = params.total_num_input_tokens;
- preprocessingParms.remove_padding = true;
- preprocessingParms.cross_attention = false;
- preprocessingParms.head_num = params.num_q_heads;
- preprocessingParms.kv_head_num = params.num_kv_heads;
- preprocessingParms.qheads_per_kv_head = params.num_q_heads / params.num_kv_heads;
- preprocessingParms.size_per_head = params.head_size;
- preprocessingParms.fmha_host_bmm1_scale = 1.0f / (sqrtf(params.head_size * 1.0f) * params.q_scaling);
- preprocessingParms.rotary_embedding_dim = params.rotary_embedding_dim;
- preprocessingParms.rotary_embedding_base = params.rotary_embedding_base;
- preprocessingParms.rotary_scale_type = params.rotary_embedding_scale_type;
- preprocessingParms.rotary_embedding_scale = params.rotary_embedding_scale;
- preprocessingParms.rotary_embedding_max_positions = params.rotary_embedding_max_positions;
- preprocessingParms.position_embedding_type = params.position_embedding_type;
- preprocessingParms.position_shift_enabled = params.position_shift_enabled;
- preprocessingParms.cache_type = cache_type;
- preprocessingParms.separate_q_kv_output = true;
- preprocessingParms.quantized_fp8_output = is_fp8_q_input;
- preprocessingParms.generation_phase = true;
- preprocessingParms.multi_processor_count = mMultiProcessorCount;
- preprocessingParms.rotary_vision_start = params.rotary_vision_start;
- preprocessingParms.rotary_vision_length = params.rotary_vision_length;
+
+ auto preprocessingParms = makeQKVPreprocessingParams(params, launchParams, xqa_q_input_ptr,
+ mQDataType, cache_type, batch_beam_size, kv_cache_buffer, cu_seqlens, cu_kv_seqlens, rotary_inv_freq_buf,
+ mMultiProcessorCount);
invokeQKVPreprocessing(preprocessingParms, params.stream);
sync_check_cuda_error(params.stream);
@@ -394,7 +424,7 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff
= reinterpret_cast(launchParams.bmm1_scale_ptr + kIdxScaleSoftmaxLog2Ptr);
tllmRunnerParams.oSfScalePtr = params.fp4_out_sf_scale;
// The sequence lengths for K/V.
- tllmRunnerParams.seqLensKvPtr = params.sequence_lengths;
+ tllmRunnerParams.seqLensKvPtr = params.cross_attention ? params.encoder_input_lengths : params.sequence_lengths;
tllmRunnerParams.oPtr = params.output;
tllmRunnerParams.oSfPtr = params.output_sf;
diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
index c73d9a2140b..4745e8e40b1 100644
--- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp
+++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
@@ -255,7 +255,8 @@ TEST_F(TransferAgentTest, SyncMessage)
checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
} while (!checked);
auto syncMessage = std::string("agent_sync_message");
- TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage};
+ nixlAgent0->notifySyncMessage(agent1, syncMessage);
+ TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1};
auto status = nixlAgent0->submitTransferRequests(writeReq);
auto notif = nixlAgent1->getNotifiedSyncMessages();
@@ -302,7 +303,8 @@ TEST_F(TransferAgentTest, SyncMessage)
} while (!checked2);
std::string syncMessage4 = "four_agent_sync_message";
- TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0, syncMessage4};
+ nixlAgent1->notifySyncMessage(agent0, syncMessage4);
+ TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0};
auto status1 = nixlAgent1->submitTransferRequests(writeReq1);
auto notif4 = nixlAgent0->getNotifiedSyncMessages();
for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++)
diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh
index 18ee554f693..cecd61a7af4 100644
--- a/docker/common/install_nixl.sh
+++ b/docker/common/install_nixl.sh
@@ -4,8 +4,9 @@ set -ex
GITHUB_URL="https://github.com"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"
-NIXL_VERSION="0.3.1"
+NIXL_VERSION="0.5.0"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
+OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH
ARCH_NAME="x86_64-linux-gnu"
GDS_PATH="$CUDA_PATH/targets/x86_64-linux"
@@ -18,25 +19,26 @@ pip3 install --no-cache-dir meson ninja pybind11
git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO}
cd nixl
-cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1)
-if [[ -z "$cuda_path" ]]; then
- echo "libcuda.so.1 not found "
+CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1)
+
+if [[ -z "$CUDA_SO_PATH" ]]; then
+ echo "libcuda.so.1 not found"
exit 1
fi
-ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1
+CUDA_SO_PATH=$(dirname $CUDA_SO_PATH)
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_SO_PATH
meson setup builddir \
-Ducx_path=$UCX_INSTALL_PATH \
-Dcudapath_lib="$CUDA_PATH/lib64" \
-Dcudapath_inc="$CUDA_PATH/include" \
-Dgds_path="$GDS_PATH" \
- -Dinstall_headers=true \
- -Dstatic_plugins=UCX
+ -Dinstall_headers=true
cd builddir && ninja install
cd ../..
rm -rf nixl* # Remove NIXL source tree to save space
-rm $CUDA_PATH/lib64/libcuda.so.1
+export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH
echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh
index 22f444d9746..ba35e82ce63 100644
--- a/docker/common/install_ucx.sh
+++ b/docker/common/install_ucx.sh
@@ -2,29 +2,28 @@
set -ex
GITHUB_URL="https://github.com"
-UCX_VERSION="v1.18.1"
+UCX_VERSION="v1.19.x"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"
UCX_REPO="https://github.com/openucx/ucx.git"
-if [ ! -d ${UCX_INSTALL_PATH} ]; then
- git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
- cd ucx
- ./autogen.sh
- ./contrib/configure-release \
- --prefix=${UCX_INSTALL_PATH} \
- --enable-shared \
- --disable-static \
- --disable-doxygen-doc \
- --enable-optimizations \
- --enable-cma \
- --enable-devel-headers \
- --with-cuda=${CUDA_PATH} \
- --with-verbs \
- --with-dm \
- --enable-mt
- make install -j$(nproc)
- cd ..
- rm -rf ucx # Remove UCX source to save space
- echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
-fi
+rm -rf ${UCX_INSTALL_PATH}
+git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
+cd ucx
+./autogen.sh
+./contrib/configure-release \
+ --prefix=${UCX_INSTALL_PATH} \
+ --enable-shared \
+ --disable-static \
+ --disable-doxygen-doc \
+ --enable-optimizations \
+ --enable-cma \
+ --enable-devel-headers \
+ --with-cuda=${CUDA_PATH} \
+ --with-verbs \
+ --with-dm \
+ --enable-mt
+make install -j$(nproc)
+cd ..
+rm -rf ucx # Remove UCX source to save space
+echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
diff --git a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md
index 8f5c1dfec0f..87432173b42 100644
--- a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md
+++ b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md
@@ -19,11 +19,11 @@ We have a forthcoming guide for achieving great performance on H100; however, th
In this section, we introduce several ways to install TensorRT-LLM.
-### NGC Docker Image of dev branch
+### NGC Docker Image
-Day-0 support for gpt-oss is provided via the NGC container image `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev`. This image was built on top of the pre-day-0 **dev branch**. This container is multi-platform and will run on both x64 and arm64 architectures.
+Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use. You can also check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status of the latest releases.
-Run the following docker command to start the TensorRT-LLM container in interactive mode:
+Run the following Docker command to start the TensorRT-LLM container in interactive mode (change the image tag to match the latest release):
```bash
docker run --rm --ipc=host -it \
@@ -33,7 +33,7 @@ docker run --rm --ipc=host -it \
-p 8000:8000 \
-e TRTLLM_ENABLE_PDL=1 \
-v ~/.cache:/root/.cache:rw \
- nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev \
+ nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0 \
/bin/bash
```
@@ -53,9 +53,9 @@ Additionally, the container mounts your user `.cache` directory to save the down
Support for gpt-oss has been [merged](https://github.com/NVIDIA/TensorRT-LLM/pull/6645) into the **main branch** of TensorRT-LLM. As we continue to optimize gpt-oss performance, you can build TensorRT-LLM from source to get the latest features and support. Please refer to the [doc](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) if you want to build from source yourself.
-### Regular Release of TensorRT-LLM
+### TensorRT-LLM Python Wheel Install
-Since gpt-oss has been supported on the main branch, you can get TensorRT-LLM out of the box through its regular release in the future. Please check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status. The release is provided as [NGC Container Image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) or [pip Python wheel](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html).
+Regular releases of TensorRT-LLM are also provided as [Python wheels](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on the pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html).
## Performance Benchmarking and Model Serving
@@ -210,7 +210,10 @@ We can use `trtllm-serve` to serve the model by translating the benchmark comman
```bash
trtllm-serve \
- gpt-oss-120b \ # Or ${local_model_path}
+Note: You can also point to a local path containing the model weights instead of the HF repo (e.g., `${local_model_path}`).
+
+trtllm-serve \
+ openai/gpt-oss-120b \
--host 0.0.0.0 \
--port 8000 \
--backend pytorch \
@@ -228,7 +231,8 @@ For max-throughput configuration, run:
```bash
trtllm-serve \
- gpt-oss-120b \ # Or ${local_model_path}
+trtllm-serve \
+ openai/gpt-oss-120b \
--host 0.0.0.0 \
--port 8000 \
--backend pytorch \
@@ -262,7 +266,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '
"messages": [
{
"role": "user",
- "content": "What is NVIDIA's advantage for inference?"
+ "content": "What is NVIDIAs advantage for inference?"
}
],
"max_tokens": 1024,
@@ -348,12 +352,7 @@ others according to your needs.
## (H200/H100 Only) Using OpenAI Triton Kernels for MoE
-OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please enable `TRITON` backend with the steps below if you are running on Hopper GPUs.
-
-### Installing OpenAI Triton
-
-The `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev` has prepared Triton already (`echo $TRITON_ROOT` could reveal the path). In other situations, you will need to build and install a specific version of Triton. Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe).
-
+OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe) to install and enable the `TRITON` MoE kernels on Hopper GPUs.
### Selecting Triton as the MoE backend
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 61ed4978d96..f43d454ac8a 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1835,6 +1835,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"H100_PCIe-TensorRT-Post-Merge-4": ["h100-cr", "l0_h100", 4, 5],
"H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
"B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
+ "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
"H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
"H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
"DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index 751f2516358..bd46241e51d 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -11,7 +11,7 @@
#
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
diff --git a/requirements.txt b/requirements.txt
index e2582f50385..a7821f15db6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu128
-c constraints.txt
-accelerate>=0.25.0
+accelerate>=1.7.0
build
colored
cuda-python>=12,<13
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
index e40543c78f3..d7cd4c61f1a 100755
--- a/scripts/build_wheel.py
+++ b/scripts/build_wheel.py
@@ -329,10 +329,8 @@ def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str:
return str(temp_dir_path)
-def check_missing_libs(so_prefix: str) -> list[str]:
- result = build_run(f"ldd {so_prefix}.cpython*.so",
- capture_output=True,
- text=True)
+def check_missing_libs(lib_name: str) -> list[str]:
+ result = build_run(f"ldd {lib_name}", capture_output=True, text=True)
missing = []
for line in result.stdout.splitlines():
if "not found" in line:
@@ -344,7 +342,7 @@ def check_missing_libs(so_prefix: str) -> list[str]:
def generate_python_stubs_linux(binding_type: str, venv_python: Path,
- deep_ep: bool):
+ deep_ep: bool, binding_lib_name: str):
is_nanobind = binding_type == "nanobind"
if is_nanobind:
build_run(f"\"{venv_python}\" -m pip install nanobind")
@@ -353,7 +351,7 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
env_stub_gen = os.environ.copy()
cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get(
"CUDA_PATH") or "/usr/local/cuda"
- missing_libs = check_missing_libs("bindings")
+ missing_libs = check_missing_libs(binding_lib_name)
cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs"
if missing_libs and Path(cuda_stub_dir).exists():
@@ -806,7 +804,9 @@ def get_binding_lib(subdirectory, name):
) == 1, f"Exactly one binding library should be present: {binding_lib}"
return binding_lib[0]
- install_file(get_binding_lib(binding_type, "bindings"), pkg_dir)
+ binding_lib_dir = get_binding_lib(binding_type, "bindings")
+ binding_lib_file_name = binding_lib_dir.name
+ install_file(binding_lib_dir, pkg_dir)
with (build_dir / "tensorrt_llm" / "deep_ep" /
"cuda_architectures.txt").open() as f:
@@ -846,7 +846,7 @@ def get_binding_lib(subdirectory, name):
else: # on linux
generate_python_stubs_linux(
binding_type, venv_python,
- bool(deep_ep_cuda_architectures))
+ bool(deep_ep_cuda_architectures), binding_lib_file_name)
if not skip_building_wheel:
if dist_dir is None:
diff --git a/tensorrt_llm/_torch/attention_backend/__init__.py b/tensorrt_llm/_torch/attention_backend/__init__.py
index c426b18f0a5..68805efa4a2 100644
--- a/tensorrt_llm/_torch/attention_backend/__init__.py
+++ b/tensorrt_llm/_torch/attention_backend/__init__.py
@@ -1,4 +1,4 @@
-from ..custom_ops import IS_FLASHINFER_AVAILABLE
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
from .interface import AttentionBackend, AttentionMetadata
from .trtllm import AttentionInputType, TrtllmAttention, TrtllmAttentionMetadata
from .vanilla import VanillaAttention, VanillaAttentionMetadata
diff --git a/tensorrt_llm/_torch/attention_backend/utils.py b/tensorrt_llm/_torch/attention_backend/utils.py
index 911621f150b..b741ec37c73 100644
--- a/tensorrt_llm/_torch/attention_backend/utils.py
+++ b/tensorrt_llm/_torch/attention_backend/utils.py
@@ -1,7 +1,7 @@
from typing import Optional, Type
from ...models.modeling_utils import QuantConfig
-from . import IS_FLASHINFER_AVAILABLE
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
from .interface import AttentionBackend, MLAParams, PositionalEmbeddingParams
from .trtllm import TrtllmAttention
from .vanilla import VanillaAttention
diff --git a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py
index 00b535dec61..e0e21b1d70e 100644
--- a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py
+++ b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py
@@ -43,11 +43,13 @@ def _patch_unsupported_input_tensor():
"""
original_fn = lowering.unsupported_input_tensor
- def patched_fn(t: torch.Tensor, parent=None, node=None):
+ def patched_fn(t: torch.Tensor, *args, **kwargs):
"""Bypass meta tensor check."""
if t.is_meta:
return False
- return original_fn(t, parent, node)
+ return original_fn(
+ t, *args, **kwargs
+        )  # generic pass-through of the arguments to accommodate torch-side signature changes
lowering.unsupported_input_tensor = patched_fn
try:
diff --git a/tensorrt_llm/_torch/custom_ops/__init__.py b/tensorrt_llm/_torch/custom_ops/__init__.py
index 8a81d1123a4..f8e46d3d29a 100644
--- a/tensorrt_llm/_torch/custom_ops/__init__.py
+++ b/tensorrt_llm/_torch/custom_ops/__init__.py
@@ -1,5 +1,6 @@
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
+from ..modules.attention import attn_custom_op_inplace, mla_custom_op_inplace
from .cpp_custom_ops import _register_fake
-from .flashinfer_custom_ops import IS_FLASHINFER_AVAILABLE
from .torch_custom_ops import bmm_out
from .trtllm_gen_custom_ops import fp8_block_scale_moe_runner
from .userbuffers_custom_ops import add_to_ub, copy_to_userbuffers, matmul_to_ub
@@ -12,6 +13,8 @@
'add_to_ub',
'copy_to_userbuffers',
'matmul_to_ub',
+ 'attn_custom_op_inplace',
+ 'mla_custom_op_inplace',
]
if IS_FLASHINFER_AVAILABLE:
diff --git a/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py b/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py
index 15e4b3750f2..223c9bd5b04 100644
--- a/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py
@@ -1,35 +1,11 @@
-import os
-import platform
-import traceback
-
import torch
-from ...logger import logger
-
-IS_FLASHINFER_AVAILABLE = False
-
-
-def get_env_enable_pdl():
- return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1"
-
-
-ENABLE_PDL = get_env_enable_pdl()
-if ENABLE_PDL:
- logger.info("PDL is enabled")
-
-if platform.system() != "Windows":
- try:
- import flashinfer
- IS_FLASHINFER_AVAILABLE = True
- except ImportError:
- traceback.print_exc()
- print(
- "flashinfer is not installed properly, please try pip install or building from source codes"
- )
+from ..flashinfer_utils import ENABLE_PDL, IS_FLASHINFER_AVAILABLE
if IS_FLASHINFER_AVAILABLE:
from flashinfer.activation import silu_and_mul
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+ from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
# Wrap this into a custom op since flashinfer didn't wrap it properly and we want to avoid graph breaks between MLP layers for the user buffer optimization
@torch.library.custom_op("trtllm::flashinfer_silu_and_mul", mutates_args=())
@@ -69,7 +45,7 @@ def flashinfer_apply_rope_with_cos_sin_cache_inplace(
cos_sin_cache: torch.Tensor,
is_neox: bool = True,
) -> None:
- flashinfer.apply_rope_with_cos_sin_cache_inplace(
+ apply_rope_with_cos_sin_cache_inplace(
positions,
query,
key,
diff --git a/tensorrt_llm/_torch/flashinfer_utils.py b/tensorrt_llm/_torch/flashinfer_utils.py
new file mode 100644
index 00000000000..5b150665b5b
--- /dev/null
+++ b/tensorrt_llm/_torch/flashinfer_utils.py
@@ -0,0 +1,27 @@
+import os
+import platform
+import traceback
+
+from ..logger import logger
+
+IS_FLASHINFER_AVAILABLE = False
+
+
+def get_env_enable_pdl():
+ return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1"
+
+
+ENABLE_PDL = get_env_enable_pdl()
+if ENABLE_PDL:
+ logger.info("PDL is enabled")
+
+if platform.system() != "Windows":
+ try:
+ import flashinfer
+ logger.info(f"flashinfer is available: {flashinfer.__version__}")
+ IS_FLASHINFER_AVAILABLE = True
+ except ImportError:
+ traceback.print_exc()
+ print(
+ "flashinfer is not installed properly, please try pip install or building from source codes"
+ )
diff --git a/tensorrt_llm/_torch/modules/rms_norm.py b/tensorrt_llm/_torch/modules/rms_norm.py
index 39787b82b7a..2a22d858250 100644
--- a/tensorrt_llm/_torch/modules/rms_norm.py
+++ b/tensorrt_llm/_torch/modules/rms_norm.py
@@ -19,7 +19,7 @@
import torch
from torch import nn
-from ..custom_ops import IS_FLASHINFER_AVAILABLE
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
class RMSNorm(nn.Module):
diff --git a/tensorrt_llm/_torch/modules/rotary_embedding.py b/tensorrt_llm/_torch/modules/rotary_embedding.py
index 0b0d5582687..35bf7759d2e 100644
--- a/tensorrt_llm/_torch/modules/rotary_embedding.py
+++ b/tensorrt_llm/_torch/modules/rotary_embedding.py
@@ -3,8 +3,8 @@
import torch
from torch import nn
-from ..attention_backend import IS_FLASHINFER_AVAILABLE
from ..attention_backend.interface import RopeParams
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
class RotaryEmbedding(nn.Module):
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 4cebfae58b1..a40b9b90459 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1181,6 +1181,17 @@ def _forward_step_inter_pp(self, scheduled_batch) -> SampleState:
def _validate_request(self, request: LlmRequest):
if isinstance(self.model_engine.model, DecoderModelForCausalLM):
+            # Only skip token-range checks for Llama4 when the request has multimodal data
+ from ..models.modeling_llama import Llama4ForConditionalGeneration
+ if isinstance(self.model_engine.model,
+ Llama4ForConditionalGeneration):
+ has_mm = bool(request.py_multimodal_data)
+ if has_mm:
+ logger.debug(
+ f"Skipping token-range validation for {type(self.model_engine.model).__name__} "
+ "(multimodal request)")
+ return
+
# FIXME: This check is necessary because of how Qwen2ForProcessRewardModel
# subclasses DecoderModelForCausalLM. Perhaps the functionality
# of DecoderModelForCausalLM reused by Qwen2ForProcessRewardModel
diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py
index eff80d1a69d..32f4a420864 100644
--- a/tensorrt_llm/commands/eval.py
+++ b/tensorrt_llm/commands/eval.py
@@ -20,8 +20,8 @@
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
-from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended,
- GPQAMain, JsonModeEval)
+from ..evaluate import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond,
+ GPQAExtended, GPQAMain, JsonModeEval)
from ..llmapi import BuildConfig, KvCacheConfig
from ..llmapi.llm_utils import update_llm_args_with_extra_options
from ..logger import logger, severity_map
@@ -152,6 +152,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
main.add_command(GPQAMain.command)
main.add_command(GPQAExtended.command)
main.add_command(JsonModeEval.command)
+main.add_command(MMMU.command)
if __name__ == "__main__":
main()
diff --git a/tensorrt_llm/evaluate/__init__.py b/tensorrt_llm/evaluate/__init__.py
index 58b789ecbef..89d87425338 100755
--- a/tensorrt_llm/evaluate/__init__.py
+++ b/tensorrt_llm/evaluate/__init__.py
@@ -15,10 +15,10 @@
from .cnn_dailymail import CnnDailymail
from .json_mode_eval import JsonModeEval
-from .lm_eval import GSM8K, GPQADiamond, GPQAExtended, GPQAMain
+from .lm_eval import GSM8K, MMMU, GPQADiamond, GPQAExtended, GPQAMain
from .mmlu import MMLU
__all__ = [
"CnnDailymail", "MMLU", "GSM8K", "GPQADiamond", "GPQAMain", "GPQAExtended",
- "JsonModeEval"
+ "JsonModeEval", "MMMU"
]
diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
index 920299b1030..c77e3be4479 100644
--- a/tensorrt_llm/evaluate/lm_eval.py
+++ b/tensorrt_llm/evaluate/lm_eval.py
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
+import json
import os
from contextlib import contextmanager
from typing import Dict, Iterable, List, Optional, Tuple, Union
@@ -22,6 +23,7 @@
from tqdm import tqdm
import tensorrt_llm.profiler as profiler
+from tensorrt_llm.inputs import prompt_inputs
try:
from lm_eval.api.model import TemplateLM
@@ -31,11 +33,18 @@
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
+from ..inputs import (ConversationMessage, MultimodalDataTracker,
+ add_multimodal_placeholders, convert_image_mode)
+from ..inputs.utils import apply_chat_template as trtllm_apply_chat_template
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator
+# NOTE: lm_eval uses "<image>" as the default image placeholder
+# https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25
+LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER = "<image>"
+
class LmEvalWrapper(TemplateLM):
@@ -125,6 +134,163 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
return [output.outputs[0].text for output in outputs]
+class MultimodalLmEvalWrapper(LmEvalWrapper):
+ """
+ Multimodal wrapper for lm-evaluation-harness that handles vision-language models.
+
+ This wrapper extends the base LmEvalWrapper to support multimodal inputs,
+ particularly for tasks that require both text and image processing.
+ """
+
+ def __init__(self,
+ llm: Union[LLM, PyTorchLLM],
+ sampling_params: Optional[SamplingParams] = None,
+ streaming: bool = False,
+ max_images: int = 999):
+ """
+ Initialize the multimodal wrapper.
+
+ Args:
+ llm: The language model instance (either TensorRT or PyTorch)
+ sampling_params: Parameters for text generation
+ streaming: Whether to use streaming generation
+                max_images: Maximum number of images per prompt (currently unlimited in TRT-LLM); defaults to 999, matching lm_eval's default value.
+ """
+ super().__init__(llm, sampling_params, streaming)
+
+ # NOTE: Required by lm_eval to identify this as a multimodal model
+ self.MULTIMODAL = True
+ self.max_images = max_images
+ self.model_type = self._get_model_type(llm)
+
+        # NOTE: TRT-LLM does not currently support interleaving text and images; image placeholders are added at the beginning or end of the text instead.
+        # Keep this set to False until interleaved inputs are supported.
+ self.interleave = False
+
+ def _get_model_type(self, llm: Union[LLM, PyTorchLLM]) -> str:
+ """Extract model type from the model configuration."""
+ config_path = os.path.join(llm._hf_model_dir, 'config.json')
+
+ if not os.path.exists(config_path):
+ raise FileNotFoundError(
+ f"Model configuration file not found: {config_path}")
+
+ try:
+ with open(config_path, 'r') as f:
+ config = json.load(f)
+ except json.JSONDecodeError as e:
+ raise ValueError(
+ f"Invalid JSON in model configuration file {config_path}: {e}")
+
+ if 'model_type' not in config:
+ raise KeyError(
+ f"'model_type' key not found in model configuration: {config_path}"
+ )
+
+ return config['model_type']
+
+ def apply_chat_template(self,
+ chat_history: List[Dict[str, str]],
+ add_generation_prompt: bool = True) -> str:
+ """
+ Apply chat template to multimodal conversation history.
+
+ Converts text with image placeholders into structured format expected by
+ the multimodal processor.
+
+ Adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L225
+ """
+ mm_placeholder_counts = []
+ for i in range(len(chat_history)):
+ content = chat_history[i]
+ text = content["content"]
+ image_count = min(self.max_images,
+ text.count(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER))
+
+ if self.interleave:
+                # TODO: Implement interleaved text and image handling; this
+                # branch is unreachable while self.interleave is False.
+                raise NotImplementedError(
+                    "Interleaved text and image inputs are not supported yet.")
+ else:
+ text = text.replace(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER, "")
+
+ conv = ConversationMessage(role="user", content=text)
+ mm_data_tracker = MultimodalDataTracker(self.model_type)
+
+            # NOTE: The images are already loaded; data is added here only so the tracker can count placeholders.
+ for _ in range(image_count):
+ mm_data_tracker.add_data("image", None)
+ mm_placeholder_count = mm_data_tracker.placeholder_counts()
+ if mm_placeholder_count:
+                # TODO: This assumes text and images are not interleaved; extend once interleaved inputs are supported.
+ conv["content"] = add_multimodal_placeholders(
+ self.model_type, conv["content"], mm_placeholder_count)
+ mm_placeholder_counts.append(mm_placeholder_count)
+ chat_history[i] = conv
+
+ output = trtllm_apply_chat_template(
+ model_type=self.model_type,
+ tokenizer=self.llm.tokenizer,
+ processor=self.llm.input_processor.processor,
+ conversation=chat_history,
+ add_generation_prompt=add_generation_prompt,
+ mm_placeholder_counts=mm_placeholder_counts,
+ tools=None,
+ chat_template_kwargs={
+ "continue_final_message": not add_generation_prompt
+ })
+ return output
+
+ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+ """
+ Generate text responses for multimodal requests.
+
+ This method processes multimodal requests that include both text prompts
+ and visual data (images).
+
+ Args:
+ requests: List of multimodal generation requests
+ disable_tqdm: Whether to disable progress bars
+
+ Returns:
+ List of generated text responses
+ """
+ profiler.start("trtllm exec")
+ results = []
+ for request in tqdm(requests,
+ desc="Submitting requests",
+ disable=disable_tqdm):
+
+ # NOTE: For now, only this part is different from the original generate_until
+ prompt, gen_kwargs, media_data = request.args
+ prompt = prompt_inputs(prompt)
+
+ # NOTE: Convert RGBA format to RGB format
+ images = [
+ convert_image_mode(img, "RGB") for img in media_data["visual"]
+ ]
+ prompt["multi_modal_data"] = {"image": images}
+
+ sampling_params = self._get_sampling_params(gen_kwargs)
+ output = self.llm.generate_async(prompt,
+ sampling_params=sampling_params,
+ streaming=self.streaming)
+ results.append(output)
+
+ outputs = []
+ for output in tqdm(results,
+ desc="Fetching responses",
+ disable=disable_tqdm):
+ outputs.append(output.result())
+
+ profiler.stop("trtllm exec")
+ elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")
+ logger.info(f"TRTLLM execution time: {elapsed_time:.3f} seconds.")
+ profiler.reset("trtllm exec")
+
+ return [output.outputs[0].text for output in outputs]
+
+
class LmEvalEvaluator(Evaluator):
def __init__(self,
@@ -134,7 +300,8 @@ def __init__(self,
random_seed: int = 0,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
- system_prompt: Optional[str] = None):
+ system_prompt: Optional[str] = None,
+ is_multimodal: bool = False):
try:
import lm_eval
except ImportError as e:
@@ -143,6 +310,12 @@ def __init__(self,
"Please install the package first, e.g., `pip install lm_eval`."
) from e
import lm_eval.tasks
+ self.MULTIMODAL = is_multimodal
+ if self.MULTIMODAL:
+ apply_chat_template = True
+ logger.info(
+ "Chat template automatically enabled for multimodal evaluation."
+ )
super().__init__(random_seed=random_seed,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
@@ -156,12 +329,31 @@ def __init__(self,
with self._patch_lm_eval():
self.task_dict = lm_eval.tasks.get_task_dict(
task_name, task_manager=task_manager)
- # Few-shot random seed
- self.task_dict[self.task_name].set_fewshot_seed(random_seed)
- # Shuffle dataset
- data = self.task_dict[self.task_name].dataset
- for split in data.keys():
- data[split] = data[split].shuffle(random_seed)
+
+            # Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/evaluator.py#L290
+ def _adjust_config(task_dict, random_seed):
+ adjusted_task_dict = {}
+ for task_name, task_obj in task_dict.items():
+ if isinstance(task_obj, dict):
+ adjusted_task_dict = {
+ **adjusted_task_dict,
+ **{
+ task_name: _adjust_config(task_obj, random_seed)
+ },
+ }
+ else:
+ # NOTE: Few-shot random seed
+ task_obj.set_fewshot_seed(seed=random_seed)
+ adjusted_task_dict[task_name] = task_obj
+
+ # NOTE: Shuffle dataset
+ data = adjusted_task_dict[task_name].dataset
+ for split in data.keys():
+ data[split] = data[split].shuffle(random_seed)
+
+ return adjusted_task_dict
+
+ self.task_dict = _adjust_config(self.task_dict, random_seed)
@contextmanager
def _patch_lm_eval(self):
@@ -196,8 +388,9 @@ def evaluate(self,
streaming: bool = False,
scores_filter: str = None) -> float:
import lm_eval
+ lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
results = lm_eval.evaluate(
- lm=LmEvalWrapper(llm, sampling_params, streaming),
+ lm=lm_cls(llm, sampling_params, streaming),
task_dict=self.task_dict,
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
@@ -226,6 +419,7 @@ def evaluate(self,
@classmethod
def command_harness(cls, ctx, **kwargs):
llm: Union[LLM, PyTorchLLM] = ctx.obj
+
evaluator = cls(dataset_path=kwargs.pop("dataset_path", None),
num_samples=kwargs.pop("num_samples", None),
random_seed=kwargs.pop("random_seed", 0),
@@ -233,10 +427,12 @@ def command_harness(cls, ctx, **kwargs):
False),
fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
False),
- system_prompt=kwargs.pop("system_prompt", None))
+ system_prompt=kwargs.pop("system_prompt", None),
+ is_multimodal=kwargs.pop("is_multimodal", False))
sampling_params = SamplingParams(
max_tokens=kwargs.pop("max_output_length"),
- truncate_prompt_tokens=kwargs.pop("max_input_length"))
+ truncate_prompt_tokens=kwargs.pop("max_input_length"),
+ stop=kwargs.pop("stop", None))
evaluator.evaluate(llm, sampling_params)
llm.shutdown()
@@ -419,3 +615,52 @@ def __init__(self, **kwargs):
@staticmethod
def command(ctx, **kwargs) -> None:
GPQAExtended.command_harness(ctx, **kwargs)
+
+
+class MMMU(LmEvalEvaluator):
+
+ def __init__(self, **kwargs):
+ super().__init__("mmmu_val", **kwargs)
+
+ @click.command("mmmu")
+ @click.option("--dataset_path",
+ type=str,
+ default=None,
+ help="The path to MMMU dataset. "
+ "If unspecified, the dataset is downloaded from HF hub.")
+ @click.option(
+ "--num_samples",
+ type=int,
+ default=None,
+ help="Number of samples to run the evaluation; None means full dataset."
+ )
+ @click.option("--random_seed",
+ type=int,
+ default=0,
+ help="Random seed for dataset processing.")
+ @click.option(
+ "--system_prompt",
+ type=str,
+ default=None,
+ help=
+ "The system prompt to be added on the prompt. If specified, it will add {'role': 'system', 'content': system_prompt} to the prompt."
+ )
+ @click.option("--max_input_length",
+ type=int,
+ default=8192,
+ help="Maximum prompt length.")
+ @click.option(
+ "--max_output_length",
+ type=int,
+ default=
+ 512, # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13
+ help="Maximum generation length.")
+ @click.pass_context
+ @staticmethod
+ def command(ctx, **kwargs) -> None:
+ # NOTE: MMMU is a multimodal task, so we need to set the is_multimodal and apply_chat_template flags to True
+ kwargs["is_multimodal"] = True
+ kwargs["apply_chat_template"] = True
+ kwargs[
+ "stop"] = "<|endoftext|>" # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L10
+ MMMU.command_harness(ctx, **kwargs)
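+
+
+# A hedged usage sketch for the new MMMU command (assuming the `trtllm-eval` entry
+# point that wraps tensorrt_llm/commands/eval.py; the model id below is illustrative):
+#
+#   trtllm-eval --model Qwen/Qwen2-VL-7B-Instruct mmmu --num_samples 100
+#
+# The mmmu subcommand forces is_multimodal and apply_chat_template to True and adds
+# the <|endoftext|> stop token before delegating to MMMU.command_harness above.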
diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py
index f7e5ce97d7f..070b8449cee 100644
--- a/tensorrt_llm/inputs/__init__.py
+++ b/tensorrt_llm/inputs/__init__.py
@@ -8,9 +8,9 @@
from .utils import (ALL_SUPPORTED_AUDIO_MODELS, ALL_SUPPORTED_IMAGE_MODELS,
ALL_SUPPORTED_MULTIMODAL_MODELS, ALL_SUPPORTED_VIDEO_MODELS,
ConversationMessage, MultimodalData, MultimodalDataTracker,
- add_multimodal_placeholders, async_load_audio,
- async_load_image, async_load_video,
- default_multimodal_input_loader,
+ add_multimodal_placeholders, apply_chat_template,
+ async_load_audio, async_load_image, async_load_video,
+ convert_image_mode, default_multimodal_input_loader,
encode_base64_content_from_url, load_image, load_video)
__all__ = [
@@ -37,6 +37,8 @@
"async_load_image",
"async_load_video",
"add_multimodal_placeholders",
+ "apply_chat_template",
+ "convert_image_mode",
"default_multimodal_input_loader",
"encode_base64_content_from_url",
"load_image",
diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py
index 3b856a2bfbd..a5e0c0a5a42 100644
--- a/tensorrt_llm/inputs/utils.py
+++ b/tensorrt_llm/inputs/utils.py
@@ -72,11 +72,14 @@ def load_base64_image(parsed_url: str) -> Image.Image:
return image
-def load_image(image: str,
+def load_image(image: Union[str, Image.Image],
format: str = "pt",
device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
+ if isinstance(image, Image.Image):
+ return image.convert('RGB')
+
parsed_url = urlparse(image)
if parsed_url.scheme in ["http", "https"]:
@@ -94,11 +97,14 @@ def load_image(image: str,
async def async_load_image(
- image: str,
+ image: Union[str, Image.Image],
format: str = "pt",
device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
+ if isinstance(image, Image.Image):
+ return image.convert('RGB')
+
parsed_url = urlparse(image)
if parsed_url.scheme in ["http", "https"]:
@@ -386,12 +392,13 @@ def resolve_hf_chat_template(
def handle_placeholder_exceptions(model_type: str,
conversation: list[ConversationMessage],
- mm_placeholder_counts: dict[str, int]):
+ mm_placeholder_counts: list[dict[str, int]]):
if model_type == "llava_next":
# we need to convert the flattened content back to conversation format
- for conv in conversation:
+ for conv, mm_placeholder_count in zip(conversation,
+ mm_placeholder_counts):
conv["content"] = [{"type": "text", "text": conv["content"]}, \
- *[{"type": "image"} for _ in mm_placeholder_counts]]
+ *[{"type": "image"} for _ in range(mm_placeholder_count[''])]]
else:
raise ValueError(f"This path should not be reached for: {model_type}")
return conversation
@@ -572,7 +579,10 @@ def convert_to_conversation_message(
# Check if mdata is a MultimodalData
if isinstance(mdata,
dict) and "modality" in mdata and "data" in mdata:
- mm_data_tracker.add_data(mdata["modality"], mdata["data"])
+ modality = mdata["modality"]
+ if modality == "multiple_image":
+ modality = "image"
+ mm_data_tracker.add_data(modality, mdata["data"])
else:
# Add embeddings to the tracker for placeholder handling
mm_data_tracker.add_data(mdata["modality"],
diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py
index ec67d469bc0..687799fb10a 100644
--- a/tensorrt_llm/serve/chat_utils.py
+++ b/tensorrt_llm/serve/chat_utils.py
@@ -179,6 +179,7 @@ def parse_chat_messages_coroutines(
Any, Any, Optional[Dict[str, List[Any]]]]]]:
"""Parse multiple chat messages and return conversation and coroutine."""
conversation = []
+ mm_placeholder_counts = []
mm_data_tracker = MultimodalDataTracker(model_config.model_type)
for msg in messages:
@@ -187,11 +188,12 @@ def parse_chat_messages_coroutines(
if parsed_msg["media"]:
for mdata in parsed_msg["media"]:
mm_data_tracker.add_data(mdata["modality"], mdata["data"])
- mm_placeholder_counts = mm_data_tracker.placeholder_counts()
- if mm_placeholder_counts:
- parsed_msg["content"] = add_multimodal_placeholders(
- model_config.model_type, parsed_msg["content"],
- mm_placeholder_counts)
+ mm_placeholder_count = mm_data_tracker.placeholder_counts()
+ if mm_placeholder_count:
+ parsed_msg["content"] = add_multimodal_placeholders(
+ model_config.model_type, parsed_msg["content"],
+ mm_placeholder_count)
+ mm_placeholder_counts.append(mm_placeholder_count)
return conversation, mm_data_tracker.retrieve_all_async(
), mm_placeholder_counts
diff --git a/tests/integration/defs/.test_durations b/tests/integration/defs/.test_durations
index 23a7d075d94..c85dc65bfe7 100644
--- a/tests/integration/defs/.test_durations
+++ b/tests/integration/defs/.test_durations
@@ -138,7 +138,16 @@
"disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8]": 67.32832619687542,
"disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-False-TinyLlama-1.1B-Chat-v1.0]": 46.302398771978915,
"disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-True-TinyLlama-1.1B-Chat-v1.0]": 38.81214914191514,
- "test_unittests.py::test_unittests_v2[unittest/_torch -k \"not (modeling or multi_gpu or auto_deploy)\"]": 1186.6702785710804,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/attention]": 588.56,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/compilation]": 31.94,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/debugger]": 36.69,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/executor]": 170.86,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/misc]": 600.50,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/modules]": 158.50,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/multimodal]": 23.54,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/sampler]": 107.66,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/speculative]": 1850.16,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/thop]": 852.56,
"test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_mixtral\"]": 208.1838396479725,
"test_unittests.py::test_unittests_v2[unittest/_torch/multi_gpu_modeling -k \"deepseek\"]": 393.0210295501165,
"cpp/test_e2e.py::test_model[-gpt_executor-80]": 4016.7569622844458,
@@ -238,7 +247,7 @@
"disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[False-False-TinyLlama-1.1B-Chat-v1.0]": 48.16434509307146,
"test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]": 163.86223009089008,
"test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]": 115.74023819994181,
- "test_unittests.py::test_unittests_v2[unittest/_torch -k \"modeling_llama\"]": 718.749935634085,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_llama\"]": 718.749935634085,
"accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache": 399.65961667895317,
"accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache": 392.90223736315966,
"accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized": 604.7383968606591,
@@ -280,7 +289,7 @@
"disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]": 67.3897166326642,
"disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]": 98.97588296607137,
"disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]": 67.9668476767838,
- "test_unittests.py::test_unittests_v2[unittest/_torch/test_attention_mla.py]": 26.32902159006335,
+ "test_unittests.py::test_unittests_v2[unittest/_torch/attention/test_attention_mla.py]": 26.32902159006335,
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 591.2785023800097,
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 306.84709841990843,
"accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 220.57452515885234,
@@ -292,7 +301,6 @@
"test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]": 109.26379436196294,
"test_e2e.py::test_ptp_quickstart_advanced_mixed_precision": 80.88908524392173,
"test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]": 99.42739840806462,
- "test_unittests.py::test_unittests_v2[unittest/_torch/speculative/test_eagle3.py]": 317.8708840459585,
"accuracy/test_cli_flow.py::TestLlama7B::test_auto_dtype": 402.75543826818466,
"examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float32-BertModel-bert/bert-base-uncased]": 111.17977902293205,
"examples/test_mamba.py::test_llm_mamba_1gpu[mamba-130m-float16-enable_gemm_plugin]": 112.04011878371239,
diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py
index 35234e42ef6..06303886292 100644
--- a/tests/integration/defs/accuracy/accuracy_core.py
+++ b/tests/integration/defs/accuracy/accuracy_core.py
@@ -342,6 +342,26 @@ class JsonModeEval(AccuracyTask):
apply_chat_template=True)
+class MMMU(AccuracyTask):
+ DATASET = "mmmu"
+ DATASET_DIR = f"{llm_models_root()}/datasets/MMMU"
+
+ ALPHA = 0.05
+ BETA = 0.2
+ SIGMA = 50
+ NUM_SAMPLES = 900
+
+ MAX_BATCH_SIZE = 128
+ MAX_INPUT_LEN = 8192
+ MAX_OUTPUT_LEN = 512
+
+ EVALUATOR_CLS = tensorrt_llm.evaluate.MMMU
+ EVALUATOR_KWARGS = dict(dataset_path=DATASET_DIR,
+ random_seed=0,
+ is_multimodal=True,
+ apply_chat_template=True)
+
+
class PassKeyRetrieval64k(AccuracyTask):
DATASET = "passkey_retrieval_64k"
LEVEL = 3
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
new file mode 100644
index 00000000000..6edc728210b
--- /dev/null
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -0,0 +1,2 @@
+Qwen/Qwen2-VL-7B-Instruct:
+ - accuracy: 48.44
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index f0a8e923289..8879904627e 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -31,7 +31,7 @@
parametrize_with_ids, skip_no_hopper,
skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
skip_pre_hopper)
-from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
+from .accuracy_core import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond,
JsonModeEval, LlmapiAccuracyTestHarness)
@@ -1734,6 +1734,7 @@ class TestKimiK2(LlmapiAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/Kimi-K2-Instruct"
@pytest.mark.skip_less_mpi_world_size(8)
+ @skip_post_blackwell
@skip_pre_hopper
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
@@ -2284,7 +2285,10 @@ def test_nvfp4(
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
- enable_attention_dp=attention_dp) as llm:
+ enable_attention_dp=attention_dp,
+ max_batch_size=32) as llm:
+ task = MMLU(self.MODEL_NAME)
+ task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
@@ -2442,11 +2446,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
[
(8, 1, 8, True, True, True, "CUTLASS", False),
(8, 1, 8, True, True, True, "TRTLLM", False),
- (8, 1, 8, False, False, False, "TRTLLM", True),
+ (8, 1, 8, True, True, True, "TRTLLM", True),
],
ids=[
- "latency_moe_cutlass", "latency_moe_trtllm",
- "latency_moe_trtllm_eagle3"
+ "latency_moe_cutlass",
+ "latency_moe_trtllm",
+ "latency_moe_trtllm_eagle3",
],
)
def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2481,6 +2486,50 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
+ @skip_pre_blackwell
+ @pytest.mark.skip_less_mpi_world_size(4)
+ @pytest.mark.parametrize(
+ "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+ [
+ (4, 1, 4, False, False, False, "TRTLLM",
+         True),  # TP8 hits a bug when the TRTLLM MoE backend is combined with Eagle3
+ ],
+ ids=[
+ "latency_moe_trtllm_eagle3",
+ ],
+ )
+ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+ cuda_graph, overlap_scheduler, moe_backend, eagle3):
+
+ pytorch_config = dict(
+ disable_overlap_scheduler=not overlap_scheduler,
+ cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+ moe_config=MoeConfig(backend=moe_backend))
+
+ kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+ enable_block_reuse=not eagle3)
+ spec_config = None
+ if eagle3:
+ spec_config = EagleDecodingConfig(
+ max_draft_len=2,
+ speculative_model_dir=
+ f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+ eagle3_one_model=True)
+ with LLM(
+ f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+ tensor_parallel_size=tp_size,
+ pipeline_parallel_size=pp_size,
+ moe_expert_parallel_size=ep_size,
+ **pytorch_config,
+ enable_attention_dp=attention_dp,
+ kv_cache_config=kv_cache_config,
+ speculative_config=spec_config) as llm:
+
+ task = MMLU(self.MODEL_NAME)
+ task.evaluate(llm)
+ task = GSM8K(self.MODEL_NAME)
+ task.evaluate(llm)
+
class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
@@ -2690,3 +2739,22 @@ def test_auto_dtype(self):
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
+
+
+class TestQwen2_VL_7B(LlmapiAccuracyTestHarness):
+ MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
+ MODEL_PATH = f"{llm_models_root()}/Qwen2-VL-7B-Instruct"
+
+    # NOTE: MMMU adds <|endoftext|> as a stop token.
+ sampling_params = SamplingParams(max_tokens=MMMU.MAX_OUTPUT_LEN,
+ truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+ stop="<|endoftext|>")
+
+ kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+ def test_auto_dtype(self):
+ with LLM(self.MODEL_PATH,
+ max_num_tokens=16384,
+ kv_cache_config=self.kv_cache_config) as llm:
+ task = MMMU(self.MODEL_NAME)
+ task.evaluate(llm, sampling_params=self.sampling_params)
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index c427dea2bc7..e28f1bcecd4 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
@@ -589,6 +589,7 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
@@ -630,7 +631,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-S
test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False]
-test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False]
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index c977a77d3c2..51c452cbc78 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
@@ -164,7 +164,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
-test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml
index 5ec16996e7c..06ccf6fb2ad 100644
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@@ -19,7 +19,7 @@ l0_a30:
- unittest/_torch/modeling -k "modeling_qwen"
- unittest/_torch/modeling -k "modeling_qwen_moe"
- unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
- - unittest/_torch/test_beam_search.py
+ - unittest/_torch/sampler/test_beam_search.py
- condition:
ranges:
system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index cb36129a147..ae0d0bd0411 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -20,22 +20,10 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
@@ -66,15 +54,21 @@ l0_b200:
- test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]
- test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
- test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
- - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (120)
- - unittest/_torch -k "modeling_llama"
+ - unittest/_torch/attention
+ - unittest/_torch/compilation
+ - unittest/_torch/debugger
+ - unittest/_torch/executor
+ - unittest/_torch/misc
+ - unittest/_torch/modules
+ - unittest/_torch/multimodal
+ - unittest/_torch/sampler
+ - unittest/_torch/speculative
+ - unittest/_torch/thop
+ - unittest/_torch/modeling -k "modeling_llama"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/modeling -k "modeling_deepseek"
- unittest/_torch/modeling -k "modeling_gpt_oss"
- unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
- - unittest/_torch/speculative/test_eagle3.py
- - unittest/_torch/speculative/test_kv_cache_reuse.py
- - unittest/_torch/speculative/test_dynamic_spec_decode.py
- condition:
ranges:
system_gpu_count:
@@ -100,7 +94,6 @@ l0_b200:
- unittest/trt/attention/test_gpt_attention.py -k "trtllm_gen"
- unittest/llmapi/test_llm_quant.py # 3.5 mins on B200
- unittest/trt/functional/test_fp4_gemm.py # 3 mins on B200
-
- condition:
ranges:
system_gpu_count:
@@ -117,3 +110,29 @@ l0_b200:
- triton_server/test_triton.py::test_llava[llava]
- triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]
- triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora]
+- condition:
+ ranges:
+ system_gpu_count:
+ gte: 1
+ lte: 1
+ wildcards:
+ gpu:
+ - '*b100*'
+ linux_distribution_name: ubuntu*
+ terms:
+ stage: post_merge
+ backend: pytorch
+ tests:
+ # ------------- PyTorch tests ---------------
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+ - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml
index ac39fbdc88c..7d1cc92fef5 100644
--- a/tests/integration/test_lists/test-db/l0_gb200.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200.yml
@@ -69,3 +69,4 @@ l0_gb200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
+ - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index d4d2745e3f6..857319c44c2 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -17,6 +17,5 @@ l0_gb200_multi_nodes:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
- - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
- - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
- - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
+ - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
+ - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
diff --git a/tests/integration/test_lists/test-db/l0_gb202.yml b/tests/integration/test_lists/test-db/l0_gb202.yml
index 3026d6e449d..4d2638e95f5 100644
--- a/tests/integration/test_lists/test-db/l0_gb202.yml
+++ b/tests/integration/test_lists/test-db/l0_gb202.yml
@@ -20,7 +20,7 @@ l0_gb202:
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0]
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
# - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
- - unittest/_torch/test_attention_mla.py
+ - unittest/_torch/attention/test_attention_mla.py
- test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
- test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index a52b515e644..bc840823171 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -14,14 +14,23 @@ l0_h100:
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
+ - unittest/_torch/attention
+ - unittest/_torch/compilation
+ - unittest/_torch/debugger
+ - unittest/_torch/executor
+ - unittest/_torch/misc
+ - unittest/_torch/modules
+ - unittest/_torch/multimodal
+ - unittest/_torch/sampler
+ - unittest/_torch/speculative
+ - unittest/_torch/thop
# Only key models in H100: llama/mixtral/nemotron/deepseek
- - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
- - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (90)
- - unittest/_torch -k "modeling_llama"
+ - unittest/_torch/modeling -k "modeling_llama"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/modeling -k "modeling_nemotron"
- unittest/_torch/modeling -k "modeling_gemma3"
- unittest/_torch/modeling -k "modeling_gpt_oss"
+ - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
- unittest/disaggregated/test_disagg_utils.py
- unittest/disaggregated/test_router.py
- unittest/disaggregated/test_remoteDictionary.py
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index bfb7ed58ea8..622080941b2 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -37,6 +37,7 @@ l0_l40s:
- test_e2e.py::test_openai_chat_with_logit_bias[torch_sampler]
- test_e2e.py::test_openai_completions_with_logit_bias[trtllm_sampler]
- test_e2e.py::test_openai_chat_with_logit_bias[trtllm_sampler]
+ - accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
- condition:
ranges:
system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
index 32a03fd591c..49c29329dc7 100644
--- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
+++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -17,7 +17,7 @@ l0_rtx_pro_6000:
- unittest/_torch/modeling -k "modeling_mllama"
- unittest/_torch/modeling -k "modeling_out_of_tree"
# - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
- - unittest/_torch/test_attention_mla.py
+ - unittest/_torch/attention/test_attention_mla.py
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0]
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
- test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index aed69261062..86af57819b3 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
@@ -287,13 +286,9 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5403818)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5403818)
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5453992)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5454898)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5454898)
+accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype SKIP (https://nvbugs/5454875)
examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5434372)
triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5457489)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
@@ -319,3 +314,8 @@ disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlam
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5465642)
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5464461)
+disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5448449)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5467815)
diff --git a/tests/unittest/_torch/test_attention.py b/tests/unittest/_torch/attention/test_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_attention.py
rename to tests/unittest/_torch/attention/test_attention.py
diff --git a/tests/unittest/_torch/test_attention_mla.py b/tests/unittest/_torch/attention/test_attention_mla.py
similarity index 100%
rename from tests/unittest/_torch/test_attention_mla.py
rename to tests/unittest/_torch/attention/test_attention_mla.py
diff --git a/tests/unittest/_torch/test_attention_no_cache.py b/tests/unittest/_torch/attention/test_attention_no_cache.py
similarity index 100%
rename from tests/unittest/_torch/test_attention_no_cache.py
rename to tests/unittest/_torch/attention/test_attention_no_cache.py
diff --git a/tests/unittest/_torch/test_flashinfer_attention.py b/tests/unittest/_torch/attention/test_flashinfer_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_flashinfer_attention.py
rename to tests/unittest/_torch/attention/test_flashinfer_attention.py
diff --git a/tests/unittest/_torch/test_flashinfer_star_attn.py b/tests/unittest/_torch/attention/test_flashinfer_star_attn.py
similarity index 100%
rename from tests/unittest/_torch/test_flashinfer_star_attn.py
rename to tests/unittest/_torch/attention/test_flashinfer_star_attn.py
diff --git a/tests/unittest/_torch/test_vanilla_attention.py b/tests/unittest/_torch/attention/test_vanilla_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_vanilla_attention.py
rename to tests/unittest/_torch/attention/test_vanilla_attention.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
index 6df4b188ac6..1ff956580ee 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
@@ -600,6 +600,7 @@ def test_trtllm_bench(llm_root): # noqa: F811
run_benchmark(model_name, dataset_path, temp_dir)
+@pytest.mark.skip(reason="https://nvbugs/5458798")
@pytest.mark.no_xdist
def test_trtllm_bench_backend_comparison(llm_root): # noqa: F811
"""Test that compares autodeploy backend performance against pytorch backend
diff --git a/tests/unittest/_torch/test_executor_request_queue.py b/tests/unittest/_torch/executor/test_executor_request_queue.py
similarity index 100%
rename from tests/unittest/_torch/test_executor_request_queue.py
rename to tests/unittest/_torch/executor/test_executor_request_queue.py
diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/executor/test_overlap_scheduler.py
similarity index 100%
rename from tests/unittest/_torch/test_overlap_scheduler.py
rename to tests/unittest/_torch/executor/test_overlap_scheduler.py
diff --git a/tests/unittest/_torch/test_overlap_scheduler_input.json b/tests/unittest/_torch/executor/test_overlap_scheduler_input.json
similarity index 100%
rename from tests/unittest/_torch/test_overlap_scheduler_input.json
rename to tests/unittest/_torch/executor/test_overlap_scheduler_input.json
diff --git a/tests/unittest/_torch/test_pytorch_model_engine.py b/tests/unittest/_torch/executor/test_pytorch_model_engine.py
similarity index 100%
rename from tests/unittest/_torch/test_pytorch_model_engine.py
rename to tests/unittest/_torch/executor/test_pytorch_model_engine.py
diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/executor/test_resource_manager.py
similarity index 99%
rename from tests/unittest/_torch/test_resource_manager.py
rename to tests/unittest/_torch/executor/test_resource_manager.py
index 24320a993b3..dc67e9bbfcd 100644
--- a/tests/unittest/_torch/test_resource_manager.py
+++ b/tests/unittest/_torch/executor/test_resource_manager.py
@@ -23,7 +23,7 @@
LoraModule = tensorrt_llm.bindings.LoraModule
LoraModuleType = tensorrt_llm.bindings.LoraModuleType
current_dir = pathlib.Path(__file__).parent.resolve()
-root_dir = current_dir.parent.parent.parent
+root_dir = current_dir.parent.parent.parent.parent
sys.path.append(str(root_dir / "tests" / "integration"))
@@ -44,6 +44,8 @@ def setUpClass(cls):
"""
cpp_script_dir = os.path.join(cls.CPP_RESOURCES_DIR, "scripts")
+ # No reason to run this script for each test.
+ # TODO: move this to a fixture that runs once.
generate_lora_data_args_tp1 = [
sys.executable,
f"{cpp_script_dir}/generate_test_lora_weights.py",
diff --git a/tests/unittest/_torch/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py
similarity index 100%
rename from tests/unittest/_torch/test_autotuner.py
rename to tests/unittest/_torch/misc/test_autotuner.py
diff --git a/tests/unittest/_torch/test_share_tensor.py b/tests/unittest/_torch/misc/test_share_tensor.py
similarity index 100%
rename from tests/unittest/_torch/test_share_tensor.py
rename to tests/unittest/_torch/misc/test_share_tensor.py
diff --git a/tests/unittest/_torch/test_virtual_memory.py b/tests/unittest/_torch/misc/test_virtual_memory.py
similarity index 100%
rename from tests/unittest/_torch/test_virtual_memory.py
rename to tests/unittest/_torch/misc/test_virtual_memory.py
diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py
index 2d11971d99e..397314bcab0 100644
--- a/tests/unittest/_torch/modules/test_fused_moe.py
+++ b/tests/unittest/_torch/modules/test_fused_moe.py
@@ -289,6 +289,7 @@ def per_rank_test_fused_moe_alltoall(job_id):
assert r is None
+@pytest.mark.skip(reason="https://nvbugs/5467531")
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="needs 4 GPUs to run this test")
@pytest.mark.parametrize("alltoall_method_type", [
diff --git a/tests/unittest/_torch/test_group_rmn_norm.py b/tests/unittest/_torch/modules/test_group_rmn_norm.py
similarity index 100%
rename from tests/unittest/_torch/test_group_rmn_norm.py
rename to tests/unittest/_torch/modules/test_group_rmn_norm.py
diff --git a/tests/unittest/_torch/test_mnnvl_memory.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_memory.py
similarity index 100%
rename from tests/unittest/_torch/test_mnnvl_memory.py
rename to tests/unittest/_torch/multi_gpu/test_mnnvl_memory.py
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
index 5c374d0f2aa..6149201d582 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
@@ -1,6 +1,7 @@
from difflib import SequenceMatcher
import pytest
+import torch
from utils.llm_data import llm_models_root
from tensorrt_llm import LLM, SamplingParams
@@ -43,19 +44,17 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph,
"This is a very long prompt to exercise long context. Count up to 10000 from 1, 2, 3,"
+ ", ".join(str(i) for i in range(4, 9000))
},
- # TODO: Fix multimodal test.
- # {
- # "prompt": "<|image|>This image is of color",
- # "multi_modal_data": {
- # "image": [torch.ones(3, 1024, 1024)]
- # }
- # },
+ {
+ "prompt": "<|image|>This image is of color",
+ "multi_modal_data": {
+ "image": [torch.ones(3, 1024, 1024)]
+ }
+ },
]
expected_outputs = [
- " the head of state and head of government of the",
- ", 9000, 9001, ",
- # " white. What is the color of the background of" # TODO: Fix multimodal test.
+ " the head of state and head of government of the", ", 9000, 9001, ",
+ " white. What is the color of the background of"
]
pytorch_config = dict(attn_backend=backend)
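The multimodal case re-enabled above passes `torch.ones(3, 1024, 1024)` as the image, i.e. a solid white picture under the usual CHW float convention in [0, 1], which is why the expected continuation mentions white. A small sketch of building such synthetic solid-color prompts; the helper name is illustrative only:

```python
# Illustrative sketch, not part of this diff: synthetic solid-color image
# prompts in the same shape the re-enabled test case uses (CHW, float in [0, 1]).
import torch


def make_solid_image_prompt(value: float = 1.0):
    """Return a prompt dict with a single solid-color 3x1024x1024 image tensor."""
    return {
        "prompt": "<|image|>This image is of color",
        "multi_modal_data": {
            "image": [torch.full((3, 1024, 1024), value)]
        },
    }
```

Varying `value` (for example `0.0` for black) gives other single-color probes without needing image files on disk.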
diff --git a/tests/unittest/_torch/test_beam_search.py b/tests/unittest/_torch/sampler/test_beam_search.py
similarity index 100%
rename from tests/unittest/_torch/test_beam_search.py
rename to tests/unittest/_torch/sampler/test_beam_search.py
diff --git a/tests/unittest/_torch/test_best_of_n.py b/tests/unittest/_torch/sampler/test_best_of_n.py
similarity index 100%
rename from tests/unittest/_torch/test_best_of_n.py
rename to tests/unittest/_torch/sampler/test_best_of_n.py
diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/sampler/test_return_logits.py
similarity index 100%
rename from tests/unittest/_torch/test_return_logits.py
rename to tests/unittest/_torch/sampler/test_return_logits.py
diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/sampler/test_trtllm_sampler.py
similarity index 82%
rename from tests/unittest/_torch/test_trtllm_sampler.py
rename to tests/unittest/_torch/sampler/test_trtllm_sampler.py
index d2fb0e9e65c..37227f9b53f 100644
--- a/tests/unittest/_torch/test_trtllm_sampler.py
+++ b/tests/unittest/_torch/sampler/test_trtllm_sampler.py
@@ -1,6 +1,3 @@
-import json
-from pathlib import Path
-
import pytest
from utils.llm_data import llm_models_root
from utils.util import similar
@@ -10,13 +7,6 @@
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
-# A test case of mmlu_llama from lm_eval
-@pytest.fixture(scope="module")
-def test_case():
- with open(Path(__file__).parent / "test_overlap_scheduler_input.json") as f:
- return json.load(f)
-
-
@pytest.fixture(scope="module")
def model_path():
return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@@ -39,7 +29,7 @@ def create_llm(model_dir):
@pytest.mark.high_cuda_memory
-def test_trtllm_sampler(model_path, test_case):
+def test_trtllm_sampler(model_path):
prompts = [
"Magellan and Elcano lead the first",
"The capital of France is",
@@ -50,10 +40,10 @@ def test_trtllm_sampler(model_path, test_case):
["La Paz"]]
# Test configuration
- max_new_tokens = test_case["max_new_tokens"]
- temperature = test_case["temperature"]
- top_p = test_case["top_p"]
- stop_words = test_case["stop_words"]
+ max_new_tokens = 10
+ temperature = 1.0
+ top_p = None
+ stop_words = ["."]
sampling_config = SamplingParams(max_tokens=max_new_tokens,
n=1,
diff --git a/tests/unittest/_torch/test_custom_ops.py b/tests/unittest/_torch/thop/test_custom_ops.py
similarity index 100%
rename from tests/unittest/_torch/test_custom_ops.py
rename to tests/unittest/_torch/thop/test_custom_ops.py
diff --git a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py b/tests/unittest/_torch/thop/test_fp8_per_tensor_scale_tllmg_gemm.py
similarity index 100%
rename from tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
rename to tests/unittest/_torch/thop/test_fp8_per_tensor_scale_tllmg_gemm.py