diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index afcf5adcda1..d8dc6a7580b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -50,8 +50,8 @@ /tests/unittest/_torch/compilation @NVIDIA/trt-llm-torch-graph-compiler /tests/unittest/_torch/multi_gpu/test_ar_residual_norm.py @NVIDIA/trt-llm-torch-graph-compiler /tests/unittest/_torch/multi_gpu/test_user_buffers.py @NVIDIA/trt-llm-torch-graph-compiler -/tests/unittest/_torch/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler -/tests/unittest/_torch/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler +/tests/unittest/_torch/thop/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler +/tests/unittest/_torch/misc/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler ## TensorRT-LLM Pytorch - Attention /tensorrt_llm/_torch/attention_backend @NVIDIA/trt-llm-torch-attention-devs diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index ec3f13abf45..9857e15e104 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -40,7 +40,7 @@ jobs: startsWith(github.event.comment.body, '/bot skip --comment') || startsWith(github.event.comment.body, '/bot reuse-pipeline') || startsWith(github.event.comment.body, '/bot kill')) && contains( - fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yu
anjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025"]'), + fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia"
,"greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang"]'), github.actor) steps: - name: Check if comment is issued by authorized person diff --git a/README.md b/README.md index f6625a05596..1559ee4d00c 100644 --- a/README.md +++ b/README.md @@ -18,10 +18,9 @@ TensorRT-LLM
## Tech Blogs -* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM +* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md) - * [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization) ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md) @@ -44,6 +43,7 @@ TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) ## Latest News +* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b) * [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B) * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai) * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick diff --git a/cpp/include/tensorrt_llm/common/logger.h b/cpp/include/tensorrt_llm/common/logger.h index df84e226389..c8164b10e55 100644 --- a/cpp/include/tensorrt_llm/common/logger.h +++ b/cpp/include/tensorrt_llm/common/logger.h @@ -54,20 +54,21 @@ class Logger #if defined(_MSC_VER) template - void log(Level level, char const* format, Args const&... args); + void log(Level const level, char const* format, Args const&... args); template - void log(Level level, int rank, char const* format, Args const&... args); + void log(Level const level, int const rank, char const* format, Args const&... args); #else template - void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); + void log(Level const level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); template - void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0))); + void log(Level const level, int const rank, char const* format, Args const&... args) + __attribute__((format(printf, 4, 0))); #endif template - void log(Level level, std::string const& format, Args const&... args) + void log(Level const level, std::string const& format, Args const&... args) { return log(level, format.c_str(), args...); } @@ -134,7 +135,7 @@ class Logger }; template -void Logger::log(Logger::Level level, char const* format, Args const&... args) +void Logger::log(Logger::Level const level, char const* format, Args const&... args) { if (isEnabled(level)) { diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py index bd743d829a0..24e12f54634 100644 --- a/cpp/kernels/fmha_v2/fmha_test.py +++ b/cpp/kernels/fmha_v2/fmha_test.py @@ -165,8 +165,8 @@ def test_trtllm_context_mla_attention_fmha(dtype, s): if dtype == "-bf16" and s == 4096: epsilon += ' -epsilon 0.03' - if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version != 120: - pytest.skip("FP8 MLAs are only supported on sm120 currently.") + if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version not in [90, 120]: + pytest.skip("FP8 MLAs are only supported on sm90 and sm120 currently.") # Context phase kernels, always use separate-q-k-v layout. 
subprocess.run( diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 220b7898a98..2d8a6b416a6 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -1914,8 +1914,9 @@ def enable_mutex(kspec): def enable_tma_store(kspec): + output_dtype = kspec.output_dtype if kspec.output_dtype is not None else kspec.dtype # TMA copies data in the 16B granularity. - return 'true' if (kspec.dtype in ['e4m3', 'e4m3_fp32'] + return 'true' if (output_dtype in ['e4m3', 'e4m3_fp32'] and kspec.head_size % 16 == 0) else 'false' @@ -3812,7 +3813,9 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, # use specialized kernels for cases without alibi scales. # there is a numeric issues when applying the exp2f scale optimization and alibi scale at the same time. combinations = product([False, True], \ - [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV, InputLayout.Q_PAGED_KV], [False, True]) + [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV, + InputLayout.Q_PAGED_KV, InputLayout.SEPARATE_Q_K_V], + [False, True]) for (alibi, input_layout, enable_attn_logit_softcapping) in combinations: # alibi and bmm1_tanh_scale shouldn't be used together. if alibi and enable_attn_logit_softcapping: @@ -3911,7 +3914,7 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, has_noloop=0, noloop_step=64, kv_loop_step= - 128, # use 64 kv step size to avoid register spilling + 128, # use 128 kv step size to avoid register spilling kv_tile_buffers=2, # only used by warp specialized kernels unroll_threshold=1, has_scale_max=False, @@ -3926,6 +3929,46 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, sage_block_sizes=sage_block_sizes, output_dtype=output_dtype)) + # context MLA (192x128) + # we could use param 'output_dtype' of enumerate_qgmma_flash_warpspec_kernels(), + # but it will generate many unnecessary kernels and they are not easy to filter out. + for output_type in [None, 'bf16']: + specs.append( + kernel_spec( + sm=sm, + sm_mma=90, + dtype=dtype, + seq_len=0, # support any sequence length + head_size=192, + head_size_v=128, + warps_m=4, #4x1 warpgroups + warps_n=1, + version=2, + interleaved=False, + ldgsts_q= + False, # for Hopper kernels, ldgsts = False signals TMA usage. 
+ ldgsts_k=False, + ldgsts_v=False, + share_smem_k_v=False, + loop_step=64, + q_tile_buffers=1, # only used by warp specialized kernels + has_noloop=0, + noloop_step=64, + kv_loop_step=128, + kv_tile_buffers=2, # only used by warp specialized kernels + unroll_threshold=1, + has_scale_max=False, + flash_attention=True, + warp_specialization=True, + alibi=alibi, + enable_attn_logit_softcapping=enable_attn_logit_softcapping, + return_softmax_stats= + False, # return softmax stats is not supported for fp8 now + scheduling_mode=scheduling_mode, + input_layout=input_layout, + sage_block_sizes=sage_block_sizes, + output_dtype=output_type)) + def enumerate_igmma_kernels(specs, sm=90): specs.append( @@ -6377,7 +6420,7 @@ def enumerate_kernels(): and kspec.tiled == True) # Deepseek MLA (context 192/128 separate-q-k-v) or (kspec.sm in [90, 100, 120] - and kspec.dtype in ['bf16', 'e4m3_fp32'] + and kspec.dtype in ['bf16', 'e4m3', 'e4m3_fp32'] and kspec.head_size == 192 and kspec.head_size_v == 128 and kspec.input_layout == InputLayout.SEPARATE_Q_K_V diff --git a/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h b/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h index 75946bac612..e186ab8ad26 100644 --- a/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h +++ b/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h @@ -1222,6 +1222,14 @@ struct Gmem_tile_o_qgmma_fp32_16bits inline __device__ Gmem_tile_o_qgmma_fp32_16bits( Params const& params, Block_info const& block_info, Shared&&, int tidx, int cta_row_offset = 0) : params_o_stride_in_bytes_(params.o_stride_in_bytes) + , params_scale_bmm2_( +#ifdef GENERATE_CUBIN + // Specialized for trt-llm generated cubins only. + params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2 +#else + params.scale_bmm2 +#endif + ) , actual_seqlen_(block_info.actual_seqlen) , o_ptr_(reinterpret_cast(params.o_ptr)) { @@ -1251,21 +1259,24 @@ struct Gmem_tile_o_qgmma_fp32_16bits inline __device__ void store(Accumulators const (&acc)[M][N]) { int64_t const step_m = 8 * params_o_stride_in_bytes_; - // we assume M = 1. some shortcuts. - static_assert(M == 1); - -#define STORE_COLUMN(idx) \ - { \ - float _reg0 = acc[0][mma_ni].elt(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + idx); \ - float _reg1 = acc[0][mma_ni].elt(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + idx); \ - static_assert(std::is_same_v || std::is_same_v); \ - uint32_t _out = fmha::float2_to_16bit_2(_reg0, _reg1); \ - int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \ - fmha::stg(o_ptr_ + _offset + 4 * idx, _out); \ - } +#ifdef UNIFIED_EPILOGUE_SCALE + constexpr bool Scale = false; +#else + constexpr bool Scale = true; +#endif #define STORE_COLUMNS() \ { \ - STORE_COLUMN(0) STORE_COLUMN(1) \ + /* we assume M = 1. some shortcuts. */ \ + static_assert(M == 1); \ + uint4 _src = { \ + .x = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2), \ + .y = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2), \ + .z = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1), \ + .w = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1), \ + }; \ + uint2 _dst = Acc_packer::run(this, _src); \ + int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \ + fmha::stg(o_ptr_ + _offset, _dst); \ } #pragma unroll @@ -1303,6 +1314,10 @@ struct Gmem_tile_o_qgmma_fp32_16bits // The stride between rows for the QKV matrice. 
int64_t params_o_stride_in_bytes_; + // Scaling factor; this usually means QKV descale factor in actuality + uint32_t params_scale_bmm2_; + // Scaling factor; this usually means QKV descale factor in actuality + uint32_t params_scale_bmm2_; // The pointer. char* o_ptr_; // The row loaded by this thread. diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h index c8e3c318d6f..1a8853a03ec 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h @@ -755,7 +755,7 @@ struct DMA for (int kgroup_idx = 0; kgroup_idx < Kernel_traits::BMM2_K_GROUPS; kgroup_idx++) { #pragma unroll - for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::D_GROUPS; dgroup_idx++) + for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::DV_GROUPS; dgroup_idx++) { // Src smem block is k first then d uint32_t src_offset = (kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP @@ -764,7 +764,7 @@ struct DMA // Dst smem block is d first then k uint32_t dst_offset = (dgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP - + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D) + + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::DV) * Kernel_traits::ELEMENT_BYTES; transposer.template transpose_(smem_v_src + src_offset, smem_v_dst + dst_offset); diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h index 8c93ce8a988..e12847d7659 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h @@ -589,7 +589,8 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32 // Base class. using Base = Kernel_traits; + SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_, ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_, + OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>; enum { diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index 9a438df9a2a..da44fba60c4 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -1012,7 +1012,7 @@ CUBIN_EXPORT __global__ if (threadIdx.x < smem.gemm1AccColMax.size) { auto const idx = threadIdx.x; - smem.gemm1AccColMax[idx] = mha::numeric_limits::lowest(); + smem.gemm1AccColMax[idx] = safeInitRowMax; smem.gemm1AccColSum[idx] = 0; } smem.gemm1WarpGrpBar.arrive_and_wait(); @@ -1949,7 +1949,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, uint32_t const globalRow = tileStartRow + row; if (globalRow >= cacheSeqLen) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; continue; } if (globalRow >= maskStartRow) @@ -1957,7 +1957,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, uint32_t const maskRow = globalRow - maskStartRow; if ((bit_mask >> maskRow) == 0) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2087,7 +2087,7 @@ __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32 #pragma unroll for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2380,9 +2380,9 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, { uint32_t const col = GmmaAccCoreMat::cols * (4 * n + idxInQuad) + j; assert((col < nbValidCols) == bool(endMask & (1ULL << col))); - if (((mask >> col) & 1) 
== 0) + if ((mask & (1ULL << col)) == 0) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2410,7 +2410,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uin #pragma unroll for (uint32_t i = 0; i < GmmaAccCoreMat::rows; i++) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index bbe2e2fd179..866a0d71c8f 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -285,6 +285,9 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale; xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf; + // Cross attention parameters. + xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths; + return true; } @@ -2229,6 +2232,10 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud { TLLM_CHECK_WITH_INFO(false, "No available kernels are found for FP4 output."); } + else + { + TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase."); + } } // This is the number of kv tokens that q needs to visit, but excluding one as it will be processed before the kv @@ -2750,7 +2757,7 @@ int AttentionOp::initialize() noexcept !useCustomMask() || mEnableContextFMHA, "Only Context FMHA supports custom mask input currently."); } - mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled) && !mCrossAttention + mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled) && (mType == nvinfer1::DataType::kHALF || mType == nvinfer1::DataType::kBF16) && mUseKVCache; if (mEnableXQA) diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp index 2332950629f..0ce601d5b08 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp @@ -1524,6 +1524,11 @@ struct CollectiveMmaArrayMixedInput< CUTLASS_DEVICE void tensormaps_cp_fence_release( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) + { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. 
it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h index fbcd279420d..0c2f3aed72b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h @@ -260,6 +260,8 @@ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90 extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); @@ -1969,6 +1971,10 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2 { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90}, diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index 97ad58335fc..fcf8ab3851f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -106,6 +106,9 @@ struct XQAParams void* quant_q_buffer_ptr = nullptr; + // for cross attention + int32_t const* encoder_input_lengths = nullptr; + cudaStream_t stream = 0; std::string toString() const @@ -175,6 +178,7 @@ struct XQAParams << "total_num_input_tokens :" << total_num_input_tokens << std ::endl << "is_fp8_output :" << (is_fp8_output ? 
"true" : "false") << std ::endl << "fp8_out_scale :" << fp8_out_scale << std ::endl + << "encoder_input_lengths: " << encoder_input_lengths << std::endl << "stream :" << stream; return ss.str(); diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index 65e55c65fba..4442e2c2369 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -1348,15 +1348,17 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv; - auto const src_v_idx - = static_cast(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv; - - // Only load K,V tokens from encoder qkv input. - auto k = *reinterpret_cast(¶ms.cross_kv_input[src_k_idx]); - auto v = *reinterpret_cast(¶ms.cross_kv_input[src_v_idx]); - - // The kv cache pointers. - auto k_cache_block_ptr - = reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx)); - auto v_cache_block_ptr - = reinterpret_cast(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx)); - // The vector idx in the cache block. - auto block_vec_idx - = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx); - - // Store K and V to the cache. - // INT8/FP8 kv cache. - if constexpr (sizeof(TCache) == 1) - { - // The element index inside the block. - auto block_elt_idx = block_vec_idx * ELTS_PER_VEC; - // Store 8bits kv cache. - mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant); - mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant); - } - else + // Encoder tokens (i.e. KV tokens). + if (head_idx == (kv_head_idx * params.qheads_per_kv_head) && token_idx < encoder_seq_len + && store_encoder_kv_cache && params.kv_cache_buffer.data != nullptr) { - reinterpret_cast(k_cache_block_ptr)[block_vec_idx] = k; - reinterpret_cast(v_cache_block_ptr)[block_vec_idx] = v; + // The global token idx in all sequences. + int global_token_idx = token_idx + encoder_seq_offset; + + // The memory offset. + auto const src_k_idx + = static_cast(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv; + auto const src_v_idx + = static_cast(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv; + + // Only load K,V tokens from encoder qkv input. + auto k = *reinterpret_cast(¶ms.cross_kv_input[src_k_idx]); + auto v = *reinterpret_cast(¶ms.cross_kv_input[src_v_idx]); + + // The kv cache pointers. + auto k_cache_block_ptr + = reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx)); + auto v_cache_block_ptr + = reinterpret_cast(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx)); + // The vector idx in the cache block. + auto block_vec_idx + = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx); + + // Store K and V to the cache. + // INT8/FP8 kv cache. + if constexpr (sizeof(TCache) == 1) + { + // The element index inside the block. + auto block_elt_idx = block_vec_idx * ELTS_PER_VEC; + // Store 8bits kv cache. 
+ mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant); + mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant); + } + else + { + reinterpret_cast(k_cache_block_ptr)[block_vec_idx] = k; + reinterpret_cast(v_cache_block_ptr)[block_vec_idx] = v; + } } } } diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp index 34fe1780fe0..956d1919ced 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp @@ -16,7 +16,9 @@ #include "xqaDispatcher.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" +#include namespace { @@ -38,6 +40,87 @@ constexpr inline T roundUp(T a, T b) namespace tensorrt_llm::kernels { +namespace +{ + +template +QKVPreprocessingParams makeQKVPreprocessingParams(XQAParams const& params, + XQALaunchParam const& launchParams, void* xqa_q_input_ptr, Data_type QDataType, + KvCacheDataType cache_type, int32_t batch_beam_size, KVCacheBuffer const& kv_cache_buffer, + int32_t const* cu_seqlens, int32_t const* cu_kv_seqlens, float const* rotary_inv_freq_buf, int multiProcessorCount) +{ + QKVPreprocessingParams preprocessingParms; + memset(&preprocessingParms, 0, sizeof(preprocessingParms)); + // Set parameters. + preprocessingParms.qkv_input = static_cast(const_cast(params.qkv)); + preprocessingParms.q_output = static_cast(xqa_q_input_ptr); + preprocessingParms.kv_cache_buffer = kv_cache_buffer; + preprocessingParms.kv_cache_block_scales_buffer = {}; + preprocessingParms.qkv_bias = static_cast(params.qkv_bias); + // Prepare values for fmha. + preprocessingParms.fmha_bmm1_scale = launchParams.bmm1_scale_ptr; + preprocessingParms.fmha_bmm2_scale = launchParams.bmm2_scale_ptr; + bool const is_fp8_q_input = (QDataType == DATA_TYPE_E4M3); + if (params.kv_cache_quant_mode.hasFp8KvCache()) + { + preprocessingParms.q_scale_quant_orig = params.kv_scale_quant_orig; + preprocessingParms.kv_scale_quant_orig = params.kv_scale_quant_orig; + } + if (params.is_fp8_output) + { + preprocessingParms.o_scale_orig_quant = params.fp8_out_scale; + } + // Buffers. + preprocessingParms.logn_scaling = params.logn_scaling_ptr; + preprocessingParms.seq_lens = params.spec_decoding_generation_lengths; + preprocessingParms.cache_seq_lens = params.sequence_lengths; + preprocessingParms.cu_seq_lens = cu_seqlens; + preprocessingParms.rotary_embedding_inv_freq = rotary_inv_freq_buf; + preprocessingParms.rotary_coef_cache_buffer = params.rotary_cos_sin; + preprocessingParms.kvScaleOrigQuant = params.kv_scale_orig_quant; + preprocessingParms.kv_cache_scale_factors = nullptr; + preprocessingParms.spec_decoding_position_offsets + = params.cross_attention ? nullptr : params.spec_decoding_position_offsets; + preprocessingParms.mrope_position_deltas = params.mrope_position_deltas; + // Scalar parameters. + preprocessingParms.batch_size = int(batch_beam_size); + preprocessingParms.max_input_seq_len = params.generation_input_length; + preprocessingParms.max_kv_seq_len = params.max_past_kv_length; + preprocessingParms.cyclic_kv_cache_len + = params.cross_attention ? params.max_past_kv_length : params.cyclic_attention_window_size; + preprocessingParms.sink_token_len = params.cross_attention ? 
0 : params.sink_token_length; + preprocessingParms.token_num = params.total_num_input_tokens; + preprocessingParms.remove_padding = true; + preprocessingParms.cross_attention = params.cross_attention; + preprocessingParms.head_num = params.num_q_heads; + preprocessingParms.kv_head_num = params.num_kv_heads; + preprocessingParms.qheads_per_kv_head = params.num_q_heads / params.num_kv_heads; + preprocessingParms.size_per_head = params.head_size; + preprocessingParms.fmha_host_bmm1_scale = 1.0f / (sqrtf(params.head_size * 1.0f) * params.q_scaling); + preprocessingParms.rotary_embedding_dim = params.rotary_embedding_dim; + preprocessingParms.rotary_embedding_base = params.rotary_embedding_base; + preprocessingParms.rotary_scale_type = params.rotary_embedding_scale_type; + preprocessingParms.rotary_embedding_scale = params.rotary_embedding_scale; + preprocessingParms.rotary_embedding_max_positions = params.rotary_embedding_max_positions; + preprocessingParms.position_embedding_type = params.position_embedding_type; + preprocessingParms.position_shift_enabled = params.position_shift_enabled; + preprocessingParms.cache_type = cache_type; + preprocessingParms.separate_q_kv_output = true; + preprocessingParms.quantized_fp8_output = is_fp8_q_input; + preprocessingParms.generation_phase = true; + preprocessingParms.multi_processor_count = multiProcessorCount; + preprocessingParms.rotary_vision_start = params.rotary_vision_start; + preprocessingParms.rotary_vision_length = params.rotary_vision_length; + + // Cross-attention only. + + preprocessingParms.encoder_seq_lens = params.encoder_input_lengths; + + return preprocessingParms; +} + +} // namespace + //////////////////////////////////////////////////////////////////////////////////////////////////// XqaDispatcher::XqaDispatcher(XqaFixedParams fixedParams) @@ -137,9 +220,10 @@ bool XqaDispatcher::shouldUse(XQAParams const& params) { SHOULD_NOT_USE("Fallback to MMHA as unidirectional is not supported by TRTLLM-GEN kernels."); } - if (params.cross_attention) + if (params.cross_attention && !params.paged_kv_cache) { - SHOULD_NOT_USE("Fallback to MMHA as cross attention is not supported by TRTLLM-GEN kernels."); + SHOULD_NOT_USE( + "Fallback to MMHA as cross attention without paged KV Cache is not supported by TRTLLM-GEN kernels."); } if (params.paged_kv_cache && params.tokens_per_block < 8) { @@ -252,8 +336,8 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff decoder_params.seqQOffsets = launchParams.cu_seq_lens; decoder_params.seqKVOffsets = launchParams.cu_kv_seq_lens; decoder_params.seqQLengths = params.spec_decoding_generation_lengths; - decoder_params.seqKVLengths = params.sequence_lengths; - decoder_params.batchSize = int(batch_beam_size); + decoder_params.seqKVLengths = params.cross_attention ? params.encoder_input_lengths : params.sequence_lengths; + decoder_params.batchSize = static_cast(batch_beam_size); decoder_params.maxQSeqLength = params.generation_input_length; decoder_params.numTokens = params.total_num_input_tokens; decoder_params.removePadding = true; @@ -273,10 +357,12 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff float const* rotary_inv_freq_buf = params.rotary_embedding_inv_freq_cache; // Use the nullptr for cu_seqlens when it is not computed. 
int const* cu_seqlens{nullptr}; + int const* cu_kv_seqlens{nullptr}; if (decoder_params.isBuildDecoderInfoKernelNeeded()) { rotary_inv_freq_buf = launchParams.rotary_inv_freq_buf; cu_seqlens = launchParams.cu_seq_lens; + cu_kv_seqlens = launchParams.cu_kv_seq_lens; invokeBuildDecoderInfo(decoder_params, params.stream); sync_check_cuda_error(params.stream); } @@ -285,66 +371,10 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff // NOTE: MHA kernels should read kv cache that has already been appended with new tokens' kv cache. void* xqa_q_input_ptr = inputScratch; // The preprocessing kernel that applies RoPE and updates kv cache. - QKVPreprocessingParams preprocessingParms; - memset(&preprocessingParms, 0, sizeof(preprocessingParms)); - // Set parameters. - preprocessingParms.qkv_input = static_cast(const_cast(params.qkv)); - preprocessingParms.q_output = static_cast(xqa_q_input_ptr); - preprocessingParms.kv_cache_buffer = kv_cache_buffer; - preprocessingParms.kv_cache_block_scales_buffer = {}; - preprocessingParms.qkv_bias = static_cast(params.qkv_bias); - // Prepare values for fmha. - preprocessingParms.fmha_bmm1_scale = launchParams.bmm1_scale_ptr; - preprocessingParms.fmha_bmm2_scale = launchParams.bmm2_scale_ptr; - bool const is_fp8_q_input = (mQDataType == DATA_TYPE_E4M3); - if (params.kv_cache_quant_mode.hasFp8KvCache()) - { - preprocessingParms.q_scale_quant_orig = params.kv_scale_quant_orig; - preprocessingParms.kv_scale_quant_orig = params.kv_scale_quant_orig; - } - if (params.is_fp8_output) - { - preprocessingParms.o_scale_orig_quant = params.fp8_out_scale; - } - // Buffers. - preprocessingParms.logn_scaling = params.logn_scaling_ptr; - preprocessingParms.seq_lens = params.spec_decoding_generation_lengths; - preprocessingParms.cache_seq_lens = params.sequence_lengths; - preprocessingParms.cu_seq_lens = cu_seqlens; - preprocessingParms.rotary_embedding_inv_freq = rotary_inv_freq_buf; - preprocessingParms.rotary_coef_cache_buffer = params.rotary_cos_sin; - preprocessingParms.kvScaleOrigQuant = params.kv_scale_orig_quant; - preprocessingParms.kv_cache_scale_factors = nullptr; - preprocessingParms.spec_decoding_position_offsets = params.spec_decoding_position_offsets; - preprocessingParms.mrope_position_deltas = params.mrope_position_deltas; - // Scalar parameters. 
- preprocessingParms.batch_size = int(batch_beam_size); - preprocessingParms.max_input_seq_len = params.generation_input_length; - preprocessingParms.max_kv_seq_len = params.max_past_kv_length; - preprocessingParms.cyclic_kv_cache_len = params.cyclic_attention_window_size; - preprocessingParms.sink_token_len = params.sink_token_length; - preprocessingParms.token_num = params.total_num_input_tokens; - preprocessingParms.remove_padding = true; - preprocessingParms.cross_attention = false; - preprocessingParms.head_num = params.num_q_heads; - preprocessingParms.kv_head_num = params.num_kv_heads; - preprocessingParms.qheads_per_kv_head = params.num_q_heads / params.num_kv_heads; - preprocessingParms.size_per_head = params.head_size; - preprocessingParms.fmha_host_bmm1_scale = 1.0f / (sqrtf(params.head_size * 1.0f) * params.q_scaling); - preprocessingParms.rotary_embedding_dim = params.rotary_embedding_dim; - preprocessingParms.rotary_embedding_base = params.rotary_embedding_base; - preprocessingParms.rotary_scale_type = params.rotary_embedding_scale_type; - preprocessingParms.rotary_embedding_scale = params.rotary_embedding_scale; - preprocessingParms.rotary_embedding_max_positions = params.rotary_embedding_max_positions; - preprocessingParms.position_embedding_type = params.position_embedding_type; - preprocessingParms.position_shift_enabled = params.position_shift_enabled; - preprocessingParms.cache_type = cache_type; - preprocessingParms.separate_q_kv_output = true; - preprocessingParms.quantized_fp8_output = is_fp8_q_input; - preprocessingParms.generation_phase = true; - preprocessingParms.multi_processor_count = mMultiProcessorCount; - preprocessingParms.rotary_vision_start = params.rotary_vision_start; - preprocessingParms.rotary_vision_length = params.rotary_vision_length; + + auto preprocessingParms = makeQKVPreprocessingParams(params, launchParams, xqa_q_input_ptr, + mQDataType, cache_type, batch_beam_size, kv_cache_buffer, cu_seqlens, cu_kv_seqlens, rotary_inv_freq_buf, + mMultiProcessorCount); invokeQKVPreprocessing(preprocessingParms, params.stream); sync_check_cuda_error(params.stream); @@ -394,7 +424,7 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff = reinterpret_cast(launchParams.bmm1_scale_ptr + kIdxScaleSoftmaxLog2Ptr); tllmRunnerParams.oSfScalePtr = params.fp4_out_sf_scale; // The sequence lengths for K/V. - tllmRunnerParams.seqLensKvPtr = params.sequence_lengths; + tllmRunnerParams.seqLensKvPtr = params.cross_attention ? 
params.encoder_input_lengths : params.sequence_lengths; tllmRunnerParams.oPtr = params.output; tllmRunnerParams.oSfPtr = params.output_sf; diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index c73d9a2140b..4745e8e40b1 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -255,7 +255,8 @@ TEST_F(TransferAgentTest, SyncMessage) checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); } while (!checked); auto syncMessage = std::string("agent_sync_message"); - TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage}; + nixlAgent0->notifySyncMessage(agent1, syncMessage); + TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1}; auto status = nixlAgent0->submitTransferRequests(writeReq); auto notif = nixlAgent1->getNotifiedSyncMessages(); @@ -302,7 +303,8 @@ TEST_F(TransferAgentTest, SyncMessage) } while (!checked2); std::string syncMessage4 = "four_agent_sync_message"; - TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0, syncMessage4}; + nixlAgent1->notifySyncMessage(agent0, syncMessage4); + TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0}; auto status1 = nixlAgent1->submitTransferRequests(writeReq1); auto notif4 = nixlAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++) diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh index 18ee554f693..cecd61a7af4 100644 --- a/docker/common/install_nixl.sh +++ b/docker/common/install_nixl.sh @@ -4,8 +4,9 @@ set -ex GITHUB_URL="https://github.com" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" -NIXL_VERSION="0.3.1" +NIXL_VERSION="0.5.0" NIXL_REPO="https://github.com/ai-dynamo/nixl.git" +OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH ARCH_NAME="x86_64-linux-gnu" GDS_PATH="$CUDA_PATH/targets/x86_64-linux" @@ -18,25 +19,26 @@ pip3 install --no-cache-dir meson ninja pybind11 git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO} cd nixl -cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1) -if [[ -z "$cuda_path" ]]; then - echo "libcuda.so.1 not found " +CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1) + +if [[ -z "$CUDA_SO_PATH" ]]; then + echo "libcuda.so.1 not found" exit 1 fi -ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1 +CUDA_SO_PATH=$(dirname $CUDA_SO_PATH) +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_SO_PATH meson setup builddir \ -Ducx_path=$UCX_INSTALL_PATH \ -Dcudapath_lib="$CUDA_PATH/lib64" \ -Dcudapath_inc="$CUDA_PATH/include" \ -Dgds_path="$GDS_PATH" \ - -Dinstall_headers=true \ - -Dstatic_plugins=UCX + -Dinstall_headers=true cd builddir && ninja install cd ../.. 
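The reworked `docker/common/install_nixl.sh` above no longer symlinks a `libcuda.so.1` stub into `$CUDA_PATH/lib64`; it locates the real driver library, appends its directory to `LD_LIBRARY_PATH` only for the duration of the build, and restores the saved value once the install finishes (see the script's remaining lines below). A minimal Python sketch of the same save/extend/restore pattern, with a placeholder standing in for the script's actual meson/ninja build step:

```python
import os
import subprocess

# Locate the real driver library instead of symlinking a stub into $CUDA_PATH/lib64.
result = subprocess.run(["find", "/usr/local", "-name", "libcuda.so.1"],
                        capture_output=True, text=True)
candidates = result.stdout.splitlines()
if not candidates:
    raise SystemExit("libcuda.so.1 not found")

old_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
# Extend the library search path only for the build step.
os.environ["LD_LIBRARY_PATH"] = f"{old_ld_library_path}:{os.path.dirname(candidates[0])}"
try:
    pass  # placeholder for the build (meson setup builddir ... && ninja install)
finally:
    # Restore the original value afterwards, as the script now does.
    os.environ["LD_LIBRARY_PATH"] = old_ld_library_path
```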
rm -rf nixl* # Remove NIXL source tree to save space -rm $CUDA_PATH/lib64/libcuda.so.1 +export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}" diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh index 22f444d9746..ba35e82ce63 100644 --- a/docker/common/install_ucx.sh +++ b/docker/common/install_ucx.sh @@ -2,29 +2,28 @@ set -ex GITHUB_URL="https://github.com" -UCX_VERSION="v1.18.1" +UCX_VERSION="v1.19.x" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" UCX_REPO="https://github.com/openucx/ucx.git" -if [ ! -d ${UCX_INSTALL_PATH} ]; then - git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} - cd ucx - ./autogen.sh - ./contrib/configure-release \ - --prefix=${UCX_INSTALL_PATH} \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=${CUDA_PATH} \ - --with-verbs \ - --with-dm \ - --enable-mt - make install -j$(nproc) - cd .. - rm -rf ucx # Remove UCX source to save space - echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}" -fi +rm -rf ${UCX_INSTALL_PATH} +git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} +cd ucx +./autogen.sh +./contrib/configure-release \ + --prefix=${UCX_INSTALL_PATH} \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=${CUDA_PATH} \ + --with-verbs \ + --with-dm \ + --enable-mt +make install -j$(nproc) +cd .. +rm -rf ucx # Remove UCX source to save space +echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}" diff --git a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md index 8f5c1dfec0f..87432173b42 100644 --- a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md +++ b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md @@ -19,11 +19,11 @@ We have a forthcoming guide for achieving great performance on H100; however, th In this section, we introduce several ways to install TensorRT-LLM. -### NGC Docker Image of dev branch +### NGC Docker Image -Day-0 support for gpt-oss is provided via the NGC container image `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev`. This image was built on top of the pre-day-0 **dev branch**. This container is multi-platform and will run on both x64 and arm64 architectures. +Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use. You can also check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status of the latest releases. 
-Run the following docker command to start the TensorRT-LLM container in interactive mode: +Run the following Docker command to start the TensorRT-LLM container in interactive mode (change the image tag to match latest release): ```bash docker run --rm --ipc=host -it \ @@ -33,7 +33,7 @@ docker run --rm --ipc=host -it \ -p 8000:8000 \ -e TRTLLM_ENABLE_PDL=1 \ -v ~/.cache:/root/.cache:rw \ - nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev \ + nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0 \ /bin/bash ``` @@ -53,9 +53,9 @@ Additionally, the container mounts your user `.cache` directory to save the down Support for gpt-oss has been [merged](https://github.com/NVIDIA/TensorRT-LLM/pull/6645) into the **main branch** of TensorRT-LLM. As we continue to optimize gpt-oss performance, you can build TensorRT-LLM from source to get the latest features and support. Please refer to the [doc](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) if you want to build from source yourself. -### Regular Release of TensorRT-LLM +### TensorRT-LLM Python Wheel Install -Since gpt-oss has been supported on the main branch, you can get TensorRT-LLM out of the box through its regular release in the future. Please check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status. The release is provided as [NGC Container Image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) or [pip Python wheel](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html). +Regular releases of TensorRT-LLM are also provided as [Python wheels](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on the pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html). ## Performance Benchmarking and Model Serving @@ -210,7 +210,10 @@ We can use `trtllm-serve` to serve the model by translating the benchmark comman ```bash trtllm-serve \ - gpt-oss-120b \ # Or ${local_model_path} +Note: You can also point to a local path containing the model weights instead of the HF repo (e.g., `${local_model_path}`). + +trtllm-serve \ + openai/gpt-oss-120b \ --host 0.0.0.0 \ --port 8000 \ --backend pytorch \ @@ -228,7 +231,8 @@ For max-throughput configuration, run: ```bash trtllm-serve \ - gpt-oss-120b \ # Or ${local_model_path} +trtllm-serve \ + openai/gpt-oss-120b \ --host 0.0.0.0 \ --port 8000 \ --backend pytorch \ @@ -262,7 +266,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' "messages": [ { "role": "user", - "content": "What is NVIDIA's advantage for inference?" + "content": "What is NVIDIAs advantage for inference?" } ], "max_tokens": 1024, @@ -348,12 +352,7 @@ others according to your needs. ## (H200/H100 Only) Using OpenAI Triton Kernels for MoE -OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please enable `TRITON` backend with the steps below if you are running on Hopper GPUs. - -### Installing OpenAI Triton - -The `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev` has prepared Triton already (`echo $TRITON_ROOT` could reveal the path). In other situations, you will need to build and install a specific version of Triton. 
Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe). - +OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe) to install and enable the `TRITON` MoE kernels on Hopper GPUs. ### Selecting Triton as the MoE backend diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 61ed4978d96..f43d454ac8a 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1835,6 +1835,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "H100_PCIe-TensorRT-Post-Merge-4": ["h100-cr", "l0_h100", 4, 5], "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5], "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], + "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1], "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1], "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8], diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 751f2516358..bd46241e51d 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -11,7 +11,7 @@ # # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508201630-pre-test diff --git a/requirements.txt b/requirements.txt index e2582f50385..a7821f15db6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cu128 -c constraints.txt -accelerate>=0.25.0 +accelerate>=1.7.0 build colored cuda-python>=12,<13 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index e40543c78f3..d7cd4c61f1a 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -329,10 +329,8 @@ def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str: return str(temp_dir_path) -def check_missing_libs(so_prefix: str) -> list[str]: - result = build_run(f"ldd {so_prefix}.cpython*.so", - capture_output=True, - text=True) +def check_missing_libs(lib_name: str) -> list[str]: + result = build_run(f"ldd {lib_name}", capture_output=True, text=True) missing = [] for line in result.stdout.splitlines(): if "not found" in line: @@ -344,7 +342,7 @@ def check_missing_libs(so_prefix: str) -> list[str]: def generate_python_stubs_linux(binding_type: str, venv_python: Path, - deep_ep: bool): + deep_ep: bool, binding_lib_name: str): is_nanobind = binding_type == "nanobind" if is_nanobind: build_run(f"\"{venv_python}\" -m pip install nanobind") @@ -353,7 +351,7 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path, env_stub_gen = os.environ.copy() cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get( "CUDA_PATH") or "/usr/local/cuda" - missing_libs = check_missing_libs("bindings") + missing_libs = check_missing_libs(binding_lib_name) cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs" if missing_libs and Path(cuda_stub_dir).exists(): @@ -806,7 +804,9 @@ def get_binding_lib(subdirectory, name): ) == 1, f"Exactly one binding library should be present: {binding_lib}" return binding_lib[0] - install_file(get_binding_lib(binding_type, "bindings"), pkg_dir) + binding_lib_dir = get_binding_lib(binding_type, "bindings") + binding_lib_file_name = binding_lib_dir.name + install_file(binding_lib_dir, pkg_dir) with (build_dir / "tensorrt_llm" / "deep_ep" / "cuda_architectures.txt").open() as f: @@ -846,7 +846,7 @@ def get_binding_lib(subdirectory, name): else: # on linux 
generate_python_stubs_linux( binding_type, venv_python, - bool(deep_ep_cuda_architectures)) + bool(deep_ep_cuda_architectures), binding_lib_file_name) if not skip_building_wheel: if dist_dir is None: diff --git a/tensorrt_llm/_torch/attention_backend/__init__.py b/tensorrt_llm/_torch/attention_backend/__init__.py index c426b18f0a5..68805efa4a2 100644 --- a/tensorrt_llm/_torch/attention_backend/__init__.py +++ b/tensorrt_llm/_torch/attention_backend/__init__.py @@ -1,4 +1,4 @@ -from ..custom_ops import IS_FLASHINFER_AVAILABLE +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE from .interface import AttentionBackend, AttentionMetadata from .trtllm import AttentionInputType, TrtllmAttention, TrtllmAttentionMetadata from .vanilla import VanillaAttention, VanillaAttentionMetadata diff --git a/tensorrt_llm/_torch/attention_backend/utils.py b/tensorrt_llm/_torch/attention_backend/utils.py index 911621f150b..b741ec37c73 100644 --- a/tensorrt_llm/_torch/attention_backend/utils.py +++ b/tensorrt_llm/_torch/attention_backend/utils.py @@ -1,7 +1,7 @@ from typing import Optional, Type from ...models.modeling_utils import QuantConfig -from . import IS_FLASHINFER_AVAILABLE +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE from .interface import AttentionBackend, MLAParams, PositionalEmbeddingParams from .trtllm import TrtllmAttention from .vanilla import VanillaAttention diff --git a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py index 00b535dec61..e0e21b1d70e 100644 --- a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py +++ b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py @@ -43,11 +43,13 @@ def _patch_unsupported_input_tensor(): """ original_fn = lowering.unsupported_input_tensor - def patched_fn(t: torch.Tensor, parent=None, node=None): + def patched_fn(t: torch.Tensor, *args, **kwargs): """Bypass meta tensor check.""" if t.is_meta: return False - return original_fn(t, parent, node) + return original_fn( + t, *args, **kwargs + ) # a generic pass-through of the arguments to accommodate torch side change lowering.unsupported_input_tensor = patched_fn try: diff --git a/tensorrt_llm/_torch/custom_ops/__init__.py b/tensorrt_llm/_torch/custom_ops/__init__.py index 8a81d1123a4..f8e46d3d29a 100644 --- a/tensorrt_llm/_torch/custom_ops/__init__.py +++ b/tensorrt_llm/_torch/custom_ops/__init__.py @@ -1,5 +1,6 @@ +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE +from ..modules.attention import attn_custom_op_inplace, mla_custom_op_inplace from .cpp_custom_ops import _register_fake -from .flashinfer_custom_ops import IS_FLASHINFER_AVAILABLE from .torch_custom_ops import bmm_out from .trtllm_gen_custom_ops import fp8_block_scale_moe_runner from .userbuffers_custom_ops import add_to_ub, copy_to_userbuffers, matmul_to_ub @@ -12,6 +13,8 @@ 'add_to_ub', 'copy_to_userbuffers', 'matmul_to_ub', + 'attn_custom_op_inplace', + 'mla_custom_op_inplace', ] if IS_FLASHINFER_AVAILABLE: diff --git a/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py b/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py index 15e4b3750f2..223c9bd5b04 100644 --- a/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py @@ -1,35 +1,11 @@ -import os -import platform -import traceback - import torch -from ...logger import logger - -IS_FLASHINFER_AVAILABLE = False - - -def get_env_enable_pdl(): - return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1" - - -ENABLE_PDL = 
get_env_enable_pdl() -if ENABLE_PDL: - logger.info("PDL is enabled") - -if platform.system() != "Windows": - try: - import flashinfer - IS_FLASHINFER_AVAILABLE = True - except ImportError: - traceback.print_exc() - print( - "flashinfer is not installed properly, please try pip install or building from source codes" - ) +from ..flashinfer_utils import ENABLE_PDL, IS_FLASHINFER_AVAILABLE if IS_FLASHINFER_AVAILABLE: from flashinfer.activation import silu_and_mul from flashinfer.norm import fused_add_rmsnorm, rmsnorm + from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace # Warp this into custom op since flashinfer didn't warp it properly and we want to avoid graph break between mlp layer for user buffer optimization @torch.library.custom_op("trtllm::flashinfer_silu_and_mul", mutates_args=()) @@ -69,7 +45,7 @@ def flashinfer_apply_rope_with_cos_sin_cache_inplace( cos_sin_cache: torch.Tensor, is_neox: bool = True, ) -> None: - flashinfer.apply_rope_with_cos_sin_cache_inplace( + apply_rope_with_cos_sin_cache_inplace( positions, query, key, diff --git a/tensorrt_llm/_torch/flashinfer_utils.py b/tensorrt_llm/_torch/flashinfer_utils.py new file mode 100644 index 00000000000..5b150665b5b --- /dev/null +++ b/tensorrt_llm/_torch/flashinfer_utils.py @@ -0,0 +1,27 @@ +import os +import platform +import traceback + +from ..logger import logger + +IS_FLASHINFER_AVAILABLE = False + + +def get_env_enable_pdl(): + return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1" + + +ENABLE_PDL = get_env_enable_pdl() +if ENABLE_PDL: + logger.info("PDL is enabled") + +if platform.system() != "Windows": + try: + import flashinfer + logger.info(f"flashinfer is available: {flashinfer.__version__}") + IS_FLASHINFER_AVAILABLE = True + except ImportError: + traceback.print_exc() + print( + "flashinfer is not installed properly, please try pip install or building from source codes" + ) diff --git a/tensorrt_llm/_torch/modules/rms_norm.py b/tensorrt_llm/_torch/modules/rms_norm.py index 39787b82b7a..2a22d858250 100644 --- a/tensorrt_llm/_torch/modules/rms_norm.py +++ b/tensorrt_llm/_torch/modules/rms_norm.py @@ -19,7 +19,7 @@ import torch from torch import nn -from ..custom_ops import IS_FLASHINFER_AVAILABLE +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE class RMSNorm(nn.Module): diff --git a/tensorrt_llm/_torch/modules/rotary_embedding.py b/tensorrt_llm/_torch/modules/rotary_embedding.py index 0b0d5582687..35bf7759d2e 100644 --- a/tensorrt_llm/_torch/modules/rotary_embedding.py +++ b/tensorrt_llm/_torch/modules/rotary_embedding.py @@ -3,8 +3,8 @@ import torch from torch import nn -from ..attention_backend import IS_FLASHINFER_AVAILABLE from ..attention_backend.interface import RopeParams +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE class RotaryEmbedding(nn.Module): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 4cebfae58b1..a40b9b90459 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1181,6 +1181,17 @@ def _forward_step_inter_pp(self, scheduled_batch) -> SampleState: def _validate_request(self, request: LlmRequest): if isinstance(self.model_engine.model, DecoderModelForCausalLM): + # Only skip token‐range checks for Llama4 when the request has multimodal data + from ..models.modeling_llama import Llama4ForConditionalGeneration + if isinstance(self.model_engine.model, + Llama4ForConditionalGeneration): + has_mm = bool(request.py_multimodal_data) + if has_mm: + 
logger.debug( + f"Skipping token-range validation for {type(self.model_engine.model).__name__} " + "(multimodal request)") + return + # FIXME: This check is necessary because of how Qwen2ForProcessRewardModel # subclasses DecoderModelForCausalLM. Perhaps the functionality # of DecoderModelForCausalLM reused by Qwen2ForProcessRewardModel diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index eff80d1a69d..32f4a420864 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -20,8 +20,8 @@ from .. import LLM as PyTorchLLM from .._tensorrt_engine import LLM -from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended, - GPQAMain, JsonModeEval) +from ..evaluate import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond, + GPQAExtended, GPQAMain, JsonModeEval) from ..llmapi import BuildConfig, KvCacheConfig from ..llmapi.llm_utils import update_llm_args_with_extra_options from ..logger import logger, severity_map @@ -152,6 +152,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, main.add_command(GPQAMain.command) main.add_command(GPQAExtended.command) main.add_command(JsonModeEval.command) +main.add_command(MMMU.command) if __name__ == "__main__": main() diff --git a/tensorrt_llm/evaluate/__init__.py b/tensorrt_llm/evaluate/__init__.py index 58b789ecbef..89d87425338 100755 --- a/tensorrt_llm/evaluate/__init__.py +++ b/tensorrt_llm/evaluate/__init__.py @@ -15,10 +15,10 @@ from .cnn_dailymail import CnnDailymail from .json_mode_eval import JsonModeEval -from .lm_eval import GSM8K, GPQADiamond, GPQAExtended, GPQAMain +from .lm_eval import GSM8K, MMMU, GPQADiamond, GPQAExtended, GPQAMain from .mmlu import MMLU __all__ = [ "CnnDailymail", "MMLU", "GSM8K", "GPQADiamond", "GPQAMain", "GPQAExtended", - "JsonModeEval" + "JsonModeEval", "MMMU" ] diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py index 920299b1030..c77e3be4479 100644 --- a/tensorrt_llm/evaluate/lm_eval.py +++ b/tensorrt_llm/evaluate/lm_eval.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import json import os from contextlib import contextmanager from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -22,6 +23,7 @@ from tqdm import tqdm import tensorrt_llm.profiler as profiler +from tensorrt_llm.inputs import prompt_inputs try: from lm_eval.api.model import TemplateLM @@ -31,11 +33,18 @@ from .. import LLM as PyTorchLLM from .._tensorrt_engine import LLM +from ..inputs import (ConversationMessage, MultimodalDataTracker, + add_multimodal_placeholders, convert_image_mode) +from ..inputs.utils import apply_chat_template as trtllm_apply_chat_template from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator +# NOTE: lm_eval uses "" as the default image placeholder +# https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25 +LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER = "" + class LmEvalWrapper(TemplateLM): @@ -125,6 +134,163 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: return [output.outputs[0].text for output in outputs] +class MultimodalLmEvalWrapper(LmEvalWrapper): + """ + Multimodal wrapper for lm-evaluation-harness that handles vision-language models. 
+ + This wrapper extends the base LmEvalWrapper to support multimodal inputs, + particularly for tasks that require both text and image processing. + """ + + def __init__(self, + llm: Union[LLM, PyTorchLLM], + sampling_params: Optional[SamplingParams] = None, + streaming: bool = False, + max_images: int = 999): + """ + Initialize the multimodal wrapper. + + Args: + llm: The language model instance (either TensorRT or PyTorch) + sampling_params: Parameters for text generation + streaming: Whether to use streaming generation + max_images: Maximum number of images per prompt (currently unlimited in TRT-LLM), set to 999 from lm_eval's default value. + """ + super().__init__(llm, sampling_params, streaming) + + # NOTE: Required by lm_eval to identify this as a multimodal model + self.MULTIMODAL = True + self.max_images = max_images + self.model_type = self._get_model_type(llm) + + # NOTE: In TRT-LLM, currently we do not support interleaved text and image. Instead, we are adding image placeholders at the end of the text or at the beginning of the text. + # So, until we support interleaved text and image, we set this to False. + self.interleave = False + + def _get_model_type(self, llm: Union[LLM, PyTorchLLM]) -> str: + """Extract model type from the model configuration.""" + config_path = os.path.join(llm._hf_model_dir, 'config.json') + + if not os.path.exists(config_path): + raise FileNotFoundError( + f"Model configuration file not found: {config_path}") + + try: + with open(config_path, 'r') as f: + config = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in model configuration file {config_path}: {e}") + + if 'model_type' not in config: + raise KeyError( + f"'model_type' key not found in model configuration: {config_path}" + ) + + return config['model_type'] + + def apply_chat_template(self, + chat_history: List[Dict[str, str]], + add_generation_prompt: bool = True) -> str: + """ + Apply chat template to multimodal conversation history. + + Converts text with image placeholders into structured format expected by + the multimodal processor. + + Adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L225 + """ + mm_placeholder_counts = [] + for i in range(len(chat_history)): + content = chat_history[i] + text = content["content"] + image_count = min(self.max_images, + text.count(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER)) + + if self.interleave: + # TODO: Implement interleaved text and image. + text.split(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER) + ... + else: + text = text.replace(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER, "") + + conv = ConversationMessage(role="user", content=text) + mm_data_tracker = MultimodalDataTracker(self.model_type) + + # NOTE: Since we already have loaded images, for the placeholder purpose, we add data here. + for _ in range(image_count): + mm_data_tracker.add_data("image", None) + mm_placeholder_count = mm_data_tracker.placeholder_counts() + if mm_placeholder_count: + # TODO: This is an assumption of not interleaving text and image. Need to extend to interleaved texts. 
+ conv["content"] = add_multimodal_placeholders( + self.model_type, conv["content"], mm_placeholder_count) + mm_placeholder_counts.append(mm_placeholder_count) + chat_history[i] = conv + + output = trtllm_apply_chat_template( + model_type=self.model_type, + tokenizer=self.llm.tokenizer, + processor=self.llm.input_processor.processor, + conversation=chat_history, + add_generation_prompt=add_generation_prompt, + mm_placeholder_counts=mm_placeholder_counts, + tools=None, + chat_template_kwargs={ + "continue_final_message": not add_generation_prompt + }) + return output + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + """ + Generate text responses for multimodal requests. + + This method processes multimodal requests that include both text prompts + and visual data (images). + + Args: + requests: List of multimodal generation requests + disable_tqdm: Whether to disable progress bars + + Returns: + List of generated text responses + """ + profiler.start("trtllm exec") + results = [] + for request in tqdm(requests, + desc="Submitting requests", + disable=disable_tqdm): + + # NOTE: For now, only this part is different from the original generate_until + prompt, gen_kwargs, media_data = request.args + prompt = prompt_inputs(prompt) + + # NOTE: Convert RGBA format to RGB format + images = [ + convert_image_mode(img, "RGB") for img in media_data["visual"] + ] + prompt["multi_modal_data"] = {"image": images} + + sampling_params = self._get_sampling_params(gen_kwargs) + output = self.llm.generate_async(prompt, + sampling_params=sampling_params, + streaming=self.streaming) + results.append(output) + + outputs = [] + for output in tqdm(results, + desc="Fetching responses", + disable=disable_tqdm): + outputs.append(output.result()) + + profiler.stop("trtllm exec") + elapsed_time = profiler.elapsed_time_in_sec("trtllm exec") + logger.info(f"TRTLLM execution time: {elapsed_time:.3f} seconds.") + profiler.reset("trtllm exec") + + return [output.outputs[0].text for output in outputs] + + class LmEvalEvaluator(Evaluator): def __init__(self, @@ -134,7 +300,8 @@ def __init__(self, random_seed: int = 0, apply_chat_template: bool = False, fewshot_as_multiturn: bool = False, - system_prompt: Optional[str] = None): + system_prompt: Optional[str] = None, + is_multimodal: bool = False): try: import lm_eval except ImportError as e: @@ -143,6 +310,12 @@ def __init__(self, "Please install the package first, e.g., `pip install lm_eval`." ) from e import lm_eval.tasks + self.MULTIMODAL = is_multimodal + if self.MULTIMODAL: + apply_chat_template = True + logger.info( + "Chat template automatically enabled for multimodal evaluation." 
+ ) super().__init__(random_seed=random_seed, apply_chat_template=apply_chat_template, fewshot_as_multiturn=fewshot_as_multiturn, @@ -156,12 +329,31 @@ def __init__(self, with self._patch_lm_eval(): self.task_dict = lm_eval.tasks.get_task_dict( task_name, task_manager=task_manager) - # Few-shot random seed - self.task_dict[self.task_name].set_fewshot_seed(random_seed) - # Shuffle dataset - data = self.task_dict[self.task_name].dataset - for split in data.keys(): - data[split] = data[split].shuffle(random_seed) + + # Adopted from https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/evaluator.py#L290 + def _adjust_config(task_dict, random_seed): + adjusted_task_dict = {} + for task_name, task_obj in task_dict.items(): + if isinstance(task_obj, dict): + adjusted_task_dict = { + **adjusted_task_dict, + **{ + task_name: _adjust_config(task_obj, random_seed) + }, + } + else: + # NOTE: Few-shot random seed + task_obj.set_fewshot_seed(seed=random_seed) + adjusted_task_dict[task_name] = task_obj + + # NOTE: Shuffle dataset + data = adjusted_task_dict[task_name].dataset + for split in data.keys(): + data[split] = data[split].shuffle(random_seed) + + return adjusted_task_dict + + self.task_dict = _adjust_config(self.task_dict, random_seed) @contextmanager def _patch_lm_eval(self): @@ -196,8 +388,9 @@ def evaluate(self, streaming: bool = False, scores_filter: str = None) -> float: import lm_eval + lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper results = lm_eval.evaluate( - lm=LmEvalWrapper(llm, sampling_params, streaming), + lm=lm_cls(llm, sampling_params, streaming), task_dict=self.task_dict, limit=self.num_samples, apply_chat_template=self.apply_chat_template, @@ -226,6 +419,7 @@ def evaluate(self, @classmethod def command_harness(cls, ctx, **kwargs): llm: Union[LLM, PyTorchLLM] = ctx.obj + evaluator = cls(dataset_path=kwargs.pop("dataset_path", None), num_samples=kwargs.pop("num_samples", None), random_seed=kwargs.pop("random_seed", 0), @@ -233,10 +427,12 @@ def command_harness(cls, ctx, **kwargs): False), fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn", False), - system_prompt=kwargs.pop("system_prompt", None)) + system_prompt=kwargs.pop("system_prompt", None), + is_multimodal=kwargs.pop("is_multimodal", False)) sampling_params = SamplingParams( max_tokens=kwargs.pop("max_output_length"), - truncate_prompt_tokens=kwargs.pop("max_input_length")) + truncate_prompt_tokens=kwargs.pop("max_input_length"), + stop=kwargs.pop("stop", None)) evaluator.evaluate(llm, sampling_params) llm.shutdown() @@ -419,3 +615,52 @@ def __init__(self, **kwargs): @staticmethod def command(ctx, **kwargs) -> None: GPQAExtended.command_harness(ctx, **kwargs) + + +class MMMU(LmEvalEvaluator): + + def __init__(self, **kwargs): + super().__init__("mmmu_val", **kwargs) + + @click.command("mmmu") + @click.option("--dataset_path", + type=str, + default=None, + help="The path to MMMU dataset. " + "If unspecified, the dataset is downloaded from HF hub.") + @click.option( + "--num_samples", + type=int, + default=None, + help="Number of samples to run the evaluation; None means full dataset." + ) + @click.option("--random_seed", + type=int, + default=0, + help="Random seed for dataset processing.") + @click.option( + "--system_prompt", + type=str, + default=None, + help= + "The system prompt to be added on the prompt. If specified, it will add {'role': 'system', 'content': system_prompt} to the prompt." 
+ ) + @click.option("--max_input_length", + type=int, + default=8192, + help="Maximum prompt length.") + @click.option( + "--max_output_length", + type=int, + default= + 512, # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13 + help="Maximum generation length.") + @click.pass_context + @staticmethod + def command(ctx, **kwargs) -> None: + # NOTE: MMMU is a multimodal task, so we need to set the is_multimodal and apply_chat_template flags to True + kwargs["is_multimodal"] = True + kwargs["apply_chat_template"] = True + kwargs[ + "stop"] = "<|endoftext|>" # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L10 + MMMU.command_harness(ctx, **kwargs) diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py index f7e5ce97d7f..070b8449cee 100644 --- a/tensorrt_llm/inputs/__init__.py +++ b/tensorrt_llm/inputs/__init__.py @@ -8,9 +8,9 @@ from .utils import (ALL_SUPPORTED_AUDIO_MODELS, ALL_SUPPORTED_IMAGE_MODELS, ALL_SUPPORTED_MULTIMODAL_MODELS, ALL_SUPPORTED_VIDEO_MODELS, ConversationMessage, MultimodalData, MultimodalDataTracker, - add_multimodal_placeholders, async_load_audio, - async_load_image, async_load_video, - default_multimodal_input_loader, + add_multimodal_placeholders, apply_chat_template, + async_load_audio, async_load_image, async_load_video, + convert_image_mode, default_multimodal_input_loader, encode_base64_content_from_url, load_image, load_video) __all__ = [ @@ -37,6 +37,8 @@ "async_load_image", "async_load_video", "add_multimodal_placeholders", + "apply_chat_template", + "convert_image_mode", "default_multimodal_input_loader", "encode_base64_content_from_url", "load_image", diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index 3b856a2bfbd..a5e0c0a5a42 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -72,11 +72,14 @@ def load_base64_image(parsed_url: str) -> Image.Image: return image -def load_image(image: str, +def load_image(image: Union[str, Image.Image], format: str = "pt", device: str = "cpu") -> Union[Image.Image, torch.Tensor]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" + if isinstance(image, Image.Image): + return image.convert('RGB') + parsed_url = urlparse(image) if parsed_url.scheme in ["http", "https"]: @@ -94,11 +97,14 @@ def load_image(image: str, async def async_load_image( - image: str, + image: Union[str, Image.Image], format: str = "pt", device: str = "cpu") -> Union[Image.Image, torch.Tensor]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" + if isinstance(image, Image.Image): + return image.convert('RGB') + parsed_url = urlparse(image) if parsed_url.scheme in ["http", "https"]: @@ -386,12 +392,13 @@ def resolve_hf_chat_template( def handle_placeholder_exceptions(model_type: str, conversation: list[ConversationMessage], - mm_placeholder_counts: dict[str, int]): + mm_placeholder_counts: list[dict[str, int]]): if model_type == "llava_next": # we need to convert the flattened content back to conversation format - for conv in conversation: + for conv, mm_placeholder_count in zip(conversation, + mm_placeholder_counts): conv["content"] = [{"type": "text", "text": conv["content"]}, \ - *[{"type": "image"} for _ in mm_placeholder_counts]] + *[{"type": "image"} for _ in range(mm_placeholder_count[''])]] else: raise ValueError(f"This path should not be reached for: {model_type}") return conversation @@ -572,7 +579,10 @@ def 
convert_to_conversation_message( # Check if mdata is a MultimodalData if isinstance(mdata, dict) and "modality" in mdata and "data" in mdata: - mm_data_tracker.add_data(mdata["modality"], mdata["data"]) + modality = mdata["modality"] + if modality == "multiple_image": + modality = "image" + mm_data_tracker.add_data(modality, mdata["data"]) else: # Add embeddings to the tracker for placeholder handling mm_data_tracker.add_data(mdata["modality"], diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py index ec67d469bc0..687799fb10a 100644 --- a/tensorrt_llm/serve/chat_utils.py +++ b/tensorrt_llm/serve/chat_utils.py @@ -179,6 +179,7 @@ def parse_chat_messages_coroutines( Any, Any, Optional[Dict[str, List[Any]]]]]]: """Parse multiple chat messages and return conversation and coroutine.""" conversation = [] + mm_placeholder_counts = [] mm_data_tracker = MultimodalDataTracker(model_config.model_type) for msg in messages: @@ -187,11 +188,12 @@ def parse_chat_messages_coroutines( if parsed_msg["media"]: for mdata in parsed_msg["media"]: mm_data_tracker.add_data(mdata["modality"], mdata["data"]) - mm_placeholder_counts = mm_data_tracker.placeholder_counts() - if mm_placeholder_counts: - parsed_msg["content"] = add_multimodal_placeholders( - model_config.model_type, parsed_msg["content"], - mm_placeholder_counts) + mm_placeholder_count = mm_data_tracker.placeholder_counts() + if mm_placeholder_count: + parsed_msg["content"] = add_multimodal_placeholders( + model_config.model_type, parsed_msg["content"], + mm_placeholder_count) + mm_placeholder_counts.append(mm_placeholder_count) return conversation, mm_data_tracker.retrieve_all_async( ), mm_placeholder_counts diff --git a/tests/integration/defs/.test_durations b/tests/integration/defs/.test_durations index 23a7d075d94..c85dc65bfe7 100644 --- a/tests/integration/defs/.test_durations +++ b/tests/integration/defs/.test_durations @@ -138,7 +138,16 @@ "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8]": 67.32832619687542, "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-False-TinyLlama-1.1B-Chat-v1.0]": 46.302398771978915, "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-True-TinyLlama-1.1B-Chat-v1.0]": 38.81214914191514, - "test_unittests.py::test_unittests_v2[unittest/_torch -k \"not (modeling or multi_gpu or auto_deploy)\"]": 1186.6702785710804, + "test_unittests.py::test_unittests_v2[unittest/_torch/attention]": 588.56, + "test_unittests.py::test_unittests_v2[unittest/_torch/compilation]": 31.94, + "test_unittests.py::test_unittests_v2[unittest/_torch/debugger]": 36.69, + "test_unittests.py::test_unittests_v2[unittest/_torch/executor]": 170.86, + "test_unittests.py::test_unittests_v2[unittest/_torch/misc]": 600.50, + "test_unittests.py::test_unittests_v2[unittest/_torch/modules]": 158.50, + "test_unittests.py::test_unittests_v2[unittest/_torch/multimodal]": 23.54, + "test_unittests.py::test_unittests_v2[unittest/_torch/sampler]": 107.66, + "test_unittests.py::test_unittests_v2[unittest/_torch/speculative]": 1850.16, + "test_unittests.py::test_unittests_v2[unittest/_torch/thop]": 852.56, "test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_mixtral\"]": 208.1838396479725, "test_unittests.py::test_unittests_v2[unittest/_torch/multi_gpu_modeling -k \"deepseek\"]": 393.0210295501165, "cpp/test_e2e.py::test_model[-gpt_executor-80]": 4016.7569622844458, @@ -238,7 
+247,7 @@ "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[False-False-TinyLlama-1.1B-Chat-v1.0]": 48.16434509307146, "test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]": 163.86223009089008, "test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]": 115.74023819994181, - "test_unittests.py::test_unittests_v2[unittest/_torch -k \"modeling_llama\"]": 718.749935634085, + "test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_llama\"]": 718.749935634085, "accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache": 399.65961667895317, "accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache": 392.90223736315966, "accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized": 604.7383968606591, @@ -280,7 +289,7 @@ "disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]": 67.3897166326642, "disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]": 98.97588296607137, "disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]": 67.9668476767838, - "test_unittests.py::test_unittests_v2[unittest/_torch/test_attention_mla.py]": 26.32902159006335, + "test_unittests.py::test_unittests_v2[unittest/_torch/attention/test_attention_mla.py]": 26.32902159006335, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 591.2785023800097, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 306.84709841990843, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 220.57452515885234, @@ -292,7 +301,6 @@ "test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]": 109.26379436196294, "test_e2e.py::test_ptp_quickstart_advanced_mixed_precision": 80.88908524392173, "test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]": 99.42739840806462, - "test_unittests.py::test_unittests_v2[unittest/_torch/speculative/test_eagle3.py]": 317.8708840459585, "accuracy/test_cli_flow.py::TestLlama7B::test_auto_dtype": 402.75543826818466, "examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float32-BertModel-bert/bert-base-uncased]": 111.17977902293205, "examples/test_mamba.py::test_llm_mamba_1gpu[mamba-130m-float16-enable_gemm_plugin]": 112.04011878371239, diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py index 35234e42ef6..06303886292 100644 --- a/tests/integration/defs/accuracy/accuracy_core.py +++ b/tests/integration/defs/accuracy/accuracy_core.py @@ -342,6 +342,26 @@ class JsonModeEval(AccuracyTask): apply_chat_template=True) +class MMMU(AccuracyTask): + DATASET = "mmmu" + DATASET_DIR = f"{llm_models_root()}/datasets/MMMU" + + ALPHA = 0.05 + BETA = 0.2 + SIGMA = 50 + NUM_SAMPLES = 900 + + MAX_BATCH_SIZE = 128 + MAX_INPUT_LEN = 8192 + 
MAX_OUTPUT_LEN = 512 + + EVALUATOR_CLS = tensorrt_llm.evaluate.MMMU + EVALUATOR_KWARGS = dict(dataset_path=DATASET_DIR, + random_seed=0, + is_multimodal=True, + apply_chat_template=True) + + class PassKeyRetrieval64k(AccuracyTask): DATASET = "passkey_retrieval_64k" LEVEL = 3 diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml new file mode 100644 index 00000000000..6edc728210b --- /dev/null +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -0,0 +1,2 @@ +Qwen/Qwen2-VL-7B-Instruct: + - accuracy: 48.44 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index f0a8e923289..8879904627e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -31,7 +31,7 @@ parametrize_with_ids, skip_no_hopper, skip_post_blackwell, skip_pre_ada, skip_pre_blackwell, skip_pre_hopper) -from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond, +from .accuracy_core import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond, JsonModeEval, LlmapiAccuracyTestHarness) @@ -1734,6 +1734,7 @@ class TestKimiK2(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/Kimi-K2-Instruct" @pytest.mark.skip_less_mpi_world_size(8) + @skip_post_blackwell @skip_pre_hopper @pytest.mark.parametrize( "tp_size,pp_size,ep_size,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size", @@ -2284,7 +2285,10 @@ def test_nvfp4( pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, **pytorch_config, - enable_attention_dp=attention_dp) as llm: + enable_attention_dp=attention_dp, + max_batch_size=32) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -2442,11 +2446,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, [ (8, 1, 8, True, True, True, "CUTLASS", False), (8, 1, 8, True, True, True, "TRTLLM", False), - (8, 1, 8, False, False, False, "TRTLLM", True), + (8, 1, 8, True, True, True, "TRTLLM", True), ], ids=[ - "latency_moe_cutlass", "latency_moe_trtllm", - "latency_moe_trtllm_eagle3" + "latency_moe_cutlass", + "latency_moe_trtllm", + "latency_moe_trtllm_eagle3", ], ) def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, @@ -2481,6 +2486,50 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_blackwell + @pytest.mark.skip_less_mpi_world_size(4) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3", + [ + (4, 1, 4, False, False, False, "TRTLLM", + True), # TP8 has bug when we use TRTLLM moe backend and eagle3 + ], + ids=[ + "latency_moe_trtllm_eagle3", + ], + ) + def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, + cuda_graph, overlap_scheduler, moe_backend, eagle3): + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + enable_block_reuse=not eagle3) + spec_config = None + if eagle3: + spec_config = EagleDecodingConfig( + max_draft_len=2, + speculative_model_dir= + f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", + eagle3_one_model=True) + with LLM( + f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", + 
tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-4-mini-instruct" @@ -2690,3 +2739,22 @@ def test_auto_dtype(self): task.evaluate(llm) task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + + +class TestQwen2_VL_7B(LlmapiAccuracyTestHarness): + MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct" + MODEL_PATH = f"{llm_models_root()}/Qwen2-VL-7B-Instruct" + + # NOTE: MMMU adds <|endoftext|> to the stop token. + sampling_params = SamplingParams(max_tokens=MMMU.MAX_OUTPUT_LEN, + truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, + stop="<|endoftext|>") + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + def test_auto_dtype(self): + with LLM(self.MODEL_PATH, + max_num_tokens=16384, + kv_cache_config=self.kv_cache_config) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index c427dea2bc7..e28f1bcecd4 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 @@ -589,6 +589,7 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype @@ -630,7 +631,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-S test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False] 
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index c977a77d3c2..51c452cbc78 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] @@ -164,7 +164,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml index 5ec16996e7c..06ccf6fb2ad 100644 --- a/tests/integration/test_lists/test-db/l0_a30.yml +++ b/tests/integration/test_lists/test-db/l0_a30.yml @@ -19,7 +19,7 @@ l0_a30: - unittest/_torch/modeling -k "modeling_qwen" - unittest/_torch/modeling -k "modeling_qwen_moe" - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - - unittest/_torch/test_beam_search.py + - unittest/_torch/sampler/test_beam_search.py - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index cb36129a147..ae0d0bd0411 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -20,22 +20,10 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True] @@ -66,15 +54,21 @@ l0_b200: - test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] - test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False] - - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (120) - - unittest/_torch -k "modeling_llama" + - unittest/_torch/attention + - unittest/_torch/compilation + - unittest/_torch/debugger + - unittest/_torch/executor + - unittest/_torch/misc + - unittest/_torch/modules + - unittest/_torch/multimodal + - unittest/_torch/sampler + - unittest/_torch/speculative + - unittest/_torch/thop + - unittest/_torch/modeling -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k "modeling_deepseek" - unittest/_torch/modeling -k "modeling_gpt_oss" - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - - unittest/_torch/speculative/test_eagle3.py - - unittest/_torch/speculative/test_kv_cache_reuse.py - - unittest/_torch/speculative/test_dynamic_spec_decode.py - condition: ranges: system_gpu_count: @@ -100,7 +94,6 @@ l0_b200: - unittest/trt/attention/test_gpt_attention.py -k "trtllm_gen" - unittest/llmapi/test_llm_quant.py # 3.5 mins on B200 - unittest/trt/functional/test_fp4_gemm.py # 3 mins on B200 - - condition: ranges: system_gpu_count: @@ -117,3 +110,29 @@ l0_b200: - triton_server/test_triton.py::test_llava[llava] - triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] - triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] +- condition: + ranges: + system_gpu_count: + gte: 1 + lte: 1 + wildcards: + gpu: + - '*b100*' + linux_distribution_name: ubuntu* + terms: + stage: post_merge + backend: pytorch + tests: + # ------------- PyTorch tests --------------- + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index ac39fbdc88c..7d1cc92fef5 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -69,3 +69,4 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index d4d2745e3f6..857319c44c2 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -17,6 +17,5 @@ l0_gb200_multi_nodes: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180) + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb202.yml b/tests/integration/test_lists/test-db/l0_gb202.yml index 3026d6e449d..4d2638e95f5 100644 --- a/tests/integration/test_lists/test-db/l0_gb202.yml +++ b/tests/integration/test_lists/test-db/l0_gb202.yml @@ -20,7 +20,7 @@ l0_gb202: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0] - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
   # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
-  - unittest/_torch/test_attention_mla.py
+  - unittest/_torch/attention/test_attention_mla.py
   - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
   - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index a52b515e644..bc840823171 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -14,14 +14,23 @@ l0_h100:
       backend: pytorch
   tests:
   # ------------- PyTorch tests ---------------
+  - unittest/_torch/attention
+  - unittest/_torch/compilation
+  - unittest/_torch/debugger
+  - unittest/_torch/executor
+  - unittest/_torch/misc
+  - unittest/_torch/modules
+  - unittest/_torch/multimodal
+  - unittest/_torch/sampler
+  - unittest/_torch/speculative
+  - unittest/_torch/thop
   # Only key models in H100: llama/mixtral/nemotron/deepseek
-  - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
-  - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (90)
-  - unittest/_torch -k "modeling_llama"
+  - unittest/_torch/modeling -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_nemotron"
   - unittest/_torch/modeling -k "modeling_gemma3"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
+  - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
   - unittest/disaggregated/test_disagg_utils.py
   - unittest/disaggregated/test_router.py
   - unittest/disaggregated/test_remoteDictionary.py
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index bfb7ed58ea8..622080941b2 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -37,6 +37,7 @@ l0_l40s:
   - test_e2e.py::test_openai_chat_with_logit_bias[torch_sampler]
   - test_e2e.py::test_openai_completions_with_logit_bias[trtllm_sampler]
   - test_e2e.py::test_openai_chat_with_logit_bias[trtllm_sampler]
+  - accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
index 32a03fd591c..49c29329dc7 100644
--- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
+++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -17,7 +17,7 @@ l0_rtx_pro_6000:
   - unittest/_torch/modeling -k "modeling_mllama"
   - unittest/_torch/modeling -k "modeling_out_of_tree"
   # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
-  - unittest/_torch/test_attention_mla.py
+  - unittest/_torch/attention/test_attention_mla.py
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
   - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index aed69261062..86af57819b3 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
@@ -287,13 +286,9 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5403818)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5403818)
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5453992)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5454898)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5454898)
+accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype SKIP (https://nvbugs/5454875)
 examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5434372)
 triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5457489)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
 accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
 triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
@@ -319,3 +314,8 @@ disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlam
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5465642)
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5464461)
+disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5448449)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5467815)
diff --git a/tests/unittest/_torch/test_attention.py b/tests/unittest/_torch/attention/test_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_attention.py
rename to tests/unittest/_torch/attention/test_attention.py
diff --git a/tests/unittest/_torch/test_attention_mla.py b/tests/unittest/_torch/attention/test_attention_mla.py
similarity index 100%
rename from tests/unittest/_torch/test_attention_mla.py
rename to tests/unittest/_torch/attention/test_attention_mla.py
diff --git a/tests/unittest/_torch/test_attention_no_cache.py b/tests/unittest/_torch/attention/test_attention_no_cache.py
similarity index 100%
rename from tests/unittest/_torch/test_attention_no_cache.py
rename to tests/unittest/_torch/attention/test_attention_no_cache.py
diff --git a/tests/unittest/_torch/test_flashinfer_attention.py b/tests/unittest/_torch/attention/test_flashinfer_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_flashinfer_attention.py
rename to tests/unittest/_torch/attention/test_flashinfer_attention.py
diff --git a/tests/unittest/_torch/test_flashinfer_star_attn.py b/tests/unittest/_torch/attention/test_flashinfer_star_attn.py
similarity index 100%
rename from tests/unittest/_torch/test_flashinfer_star_attn.py
rename to tests/unittest/_torch/attention/test_flashinfer_star_attn.py
diff --git a/tests/unittest/_torch/test_vanilla_attention.py b/tests/unittest/_torch/attention/test_vanilla_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_vanilla_attention.py
rename to tests/unittest/_torch/attention/test_vanilla_attention.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
index 6df4b188ac6..1ff956580ee 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
@@ -600,6 +600,7 @@ def test_trtllm_bench(llm_root):  # noqa: F811
     run_benchmark(model_name, dataset_path, temp_dir)
 
 
+@pytest.mark.skip(reason="https://nvbugs/5458798")
 @pytest.mark.no_xdist
 def test_trtllm_bench_backend_comparison(llm_root):  # noqa: F811
     """Test that compares autodeploy backend performance against pytorch backend
diff --git a/tests/unittest/_torch/test_executor_request_queue.py b/tests/unittest/_torch/executor/test_executor_request_queue.py
similarity index 100%
rename from tests/unittest/_torch/test_executor_request_queue.py
rename to tests/unittest/_torch/executor/test_executor_request_queue.py
diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/executor/test_overlap_scheduler.py
similarity index 100%
rename from tests/unittest/_torch/test_overlap_scheduler.py
rename to tests/unittest/_torch/executor/test_overlap_scheduler.py
diff --git a/tests/unittest/_torch/test_overlap_scheduler_input.json b/tests/unittest/_torch/executor/test_overlap_scheduler_input.json
similarity index 100%
rename from tests/unittest/_torch/test_overlap_scheduler_input.json
rename to tests/unittest/_torch/executor/test_overlap_scheduler_input.json
diff --git a/tests/unittest/_torch/test_pytorch_model_engine.py b/tests/unittest/_torch/executor/test_pytorch_model_engine.py
similarity index 100%
rename from tests/unittest/_torch/test_pytorch_model_engine.py
rename to tests/unittest/_torch/executor/test_pytorch_model_engine.py
diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/executor/test_resource_manager.py
similarity index 99%
rename from tests/unittest/_torch/test_resource_manager.py
rename to tests/unittest/_torch/executor/test_resource_manager.py
index 24320a993b3..dc67e9bbfcd 100644
--- a/tests/unittest/_torch/test_resource_manager.py
+++ b/tests/unittest/_torch/executor/test_resource_manager.py
@@ -23,7 +23,7 @@
 LoraModule = tensorrt_llm.bindings.LoraModule
 LoraModuleType = tensorrt_llm.bindings.LoraModuleType
 
 current_dir = pathlib.Path(__file__).parent.resolve()
-root_dir = current_dir.parent.parent.parent
+root_dir = current_dir.parent.parent.parent.parent
 
 sys.path.append(str(root_dir / "tests" / "integration"))
@@ -44,6 +44,8 @@ def setUpClass(cls):
         """
         cpp_script_dir = os.path.join(cls.CPP_RESOURCES_DIR, "scripts")
 
+        # No reason to run this script for each test.
+        # TODO: move this to a fixture that runs once.
         generate_lora_data_args_tp1 = [
             sys.executable,
             f"{cpp_script_dir}/generate_test_lora_weights.py",
diff --git a/tests/unittest/_torch/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py
similarity index 100%
rename from tests/unittest/_torch/test_autotuner.py
rename to tests/unittest/_torch/misc/test_autotuner.py
diff --git a/tests/unittest/_torch/test_share_tensor.py b/tests/unittest/_torch/misc/test_share_tensor.py
similarity index 100%
rename from tests/unittest/_torch/test_share_tensor.py
rename to tests/unittest/_torch/misc/test_share_tensor.py
diff --git a/tests/unittest/_torch/test_virtual_memory.py b/tests/unittest/_torch/misc/test_virtual_memory.py
similarity index 100%
rename from tests/unittest/_torch/test_virtual_memory.py
rename to tests/unittest/_torch/misc/test_virtual_memory.py
diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py
index 2d11971d99e..397314bcab0 100644
--- a/tests/unittest/_torch/modules/test_fused_moe.py
+++ b/tests/unittest/_torch/modules/test_fused_moe.py
@@ -289,6 +289,7 @@ def per_rank_test_fused_moe_alltoall(job_id):
         assert r is None
 
 
+@pytest.mark.skip(reason="https://nvbugs/5467531")
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
 @pytest.mark.parametrize("alltoall_method_type", [
diff --git a/tests/unittest/_torch/test_group_rmn_norm.py b/tests/unittest/_torch/modules/test_group_rmn_norm.py
similarity index 100%
rename from tests/unittest/_torch/test_group_rmn_norm.py
rename to tests/unittest/_torch/modules/test_group_rmn_norm.py
diff --git a/tests/unittest/_torch/test_mnnvl_memory.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_memory.py
similarity index 100%
rename from tests/unittest/_torch/test_mnnvl_memory.py
rename to tests/unittest/_torch/multi_gpu/test_mnnvl_memory.py
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
index 5c374d0f2aa..6149201d582 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
@@ -1,6 +1,7 @@
 from difflib import SequenceMatcher
 
 import pytest
+import torch
 from utils.llm_data import llm_models_root
 
 from tensorrt_llm import LLM, SamplingParams
@@ -43,19 +44,17 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph,
             "This is a very long prompt to exercise long context. Count up to 10000 from 1, 2, 3,"
             + ", ".join(str(i) for i in range(4, 9000))
         },
-        # TODO: Fix multimodal test.
-        # {
-        #     "prompt": "<|image|>This image is of color",
-        #     "multi_modal_data": {
-        #         "image": [torch.ones(3, 1024, 1024)]
-        #     }
-        # },
+        {
+            "prompt": "<|image|>This image is of color",
+            "multi_modal_data": {
+                "image": [torch.ones(3, 1024, 1024)]
+            }
+        },
     ]
 
     expected_outputs = [
-        " the head of state and head of government of the",
-        ", 9000, 9001, ",
-        # " white. What is the color of the background of"  # TODO: Fix multimodal test.
+        " the head of state and head of government of the", ", 9000, 9001, ",
+        " white. What is the color of the background of"
     ]
 
     pytorch_config = dict(attn_backend=backend)
diff --git a/tests/unittest/_torch/test_beam_search.py b/tests/unittest/_torch/sampler/test_beam_search.py
similarity index 100%
rename from tests/unittest/_torch/test_beam_search.py
rename to tests/unittest/_torch/sampler/test_beam_search.py
diff --git a/tests/unittest/_torch/test_best_of_n.py b/tests/unittest/_torch/sampler/test_best_of_n.py
similarity index 100%
rename from tests/unittest/_torch/test_best_of_n.py
rename to tests/unittest/_torch/sampler/test_best_of_n.py
diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/sampler/test_return_logits.py
similarity index 100%
rename from tests/unittest/_torch/test_return_logits.py
rename to tests/unittest/_torch/sampler/test_return_logits.py
diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/sampler/test_trtllm_sampler.py
similarity index 82%
rename from tests/unittest/_torch/test_trtllm_sampler.py
rename to tests/unittest/_torch/sampler/test_trtllm_sampler.py
index d2fb0e9e65c..37227f9b53f 100644
--- a/tests/unittest/_torch/test_trtllm_sampler.py
+++ b/tests/unittest/_torch/sampler/test_trtllm_sampler.py
@@ -1,6 +1,3 @@
-import json
-from pathlib import Path
-
 import pytest
 from utils.llm_data import llm_models_root
 from utils.util import similar
@@ -10,13 +7,6 @@
 from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
 
 
-# A test case of mmlu_llama from lm_eval
-@pytest.fixture(scope="module")
-def test_case():
-    with open(Path(__file__).parent / "test_overlap_scheduler_input.json") as f:
-        return json.load(f)
-
-
 @pytest.fixture(scope="module")
 def model_path():
     return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@@ -39,7 +29,7 @@ def create_llm(model_dir):
 
 
 @pytest.mark.high_cuda_memory
-def test_trtllm_sampler(model_path, test_case):
+def test_trtllm_sampler(model_path):
     prompts = [
         "Magellan and Elcano lead the first",
         "The capital of France is",
@@ -50,10 +40,10 @@ def test_trtllm_sampler(model_path):
                        ["La Paz"]]
 
     # Test configuration
-    max_new_tokens = test_case["max_new_tokens"]
-    temperature = test_case["temperature"]
-    top_p = test_case["top_p"]
-    stop_words = test_case["stop_words"]
+    max_new_tokens = 10
+    temperature = 1.0
+    top_p = None
+    stop_words = ["."]
 
     sampling_config = SamplingParams(max_tokens=max_new_tokens,
                                      n=1,
diff --git a/tests/unittest/_torch/test_custom_ops.py b/tests/unittest/_torch/thop/test_custom_ops.py
similarity index 100%
rename from tests/unittest/_torch/test_custom_ops.py
rename to tests/unittest/_torch/thop/test_custom_ops.py
diff --git a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py b/tests/unittest/_torch/thop/test_fp8_per_tensor_scale_tllmg_gemm.py
similarity index 100%
rename from tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
rename to tests/unittest/_torch/thop/test_fp8_per_tensor_scale_tllmg_gemm.py