diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index afcf5adcda1..d8dc6a7580b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -50,8 +50,8 @@ /tests/unittest/_torch/compilation @NVIDIA/trt-llm-torch-graph-compiler /tests/unittest/_torch/multi_gpu/test_ar_residual_norm.py @NVIDIA/trt-llm-torch-graph-compiler /tests/unittest/_torch/multi_gpu/test_user_buffers.py @NVIDIA/trt-llm-torch-graph-compiler -/tests/unittest/_torch/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler -/tests/unittest/_torch/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler +/tests/unittest/_torch/thop/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler +/tests/unittest/_torch/misc/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler ## TensorRT-LLM Pytorch - Attention /tensorrt_llm/_torch/attention_backend @NVIDIA/trt-llm-torch-attention-devs diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index ec3f13abf45..9857e15e104 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -40,7 +40,7 @@ jobs: startsWith(github.event.comment.body, '/bot skip --comment') || startsWith(github.event.comment.body, '/bot reuse-pipeline') || startsWith(github.event.comment.body, '/bot kill')) && contains( - fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yu
anjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025"]'), + fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia"
,"greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang"]'), github.actor) steps: - name: Check if comment is issued by authorized person diff --git a/README.md b/README.md index f6625a05596..1559ee4d00c 100644 --- a/README.md +++ b/README.md @@ -18,10 +18,9 @@ TensorRT-LLM
## Tech Blogs -* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM +* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md) - * [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization) ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md) @@ -44,6 +43,7 @@ TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) ## Latest News +* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b) * [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B) * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai) * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick diff --git a/cpp/include/tensorrt_llm/common/logger.h b/cpp/include/tensorrt_llm/common/logger.h index df84e226389..c8164b10e55 100644 --- a/cpp/include/tensorrt_llm/common/logger.h +++ b/cpp/include/tensorrt_llm/common/logger.h @@ -54,20 +54,21 @@ class Logger #if defined(_MSC_VER) template - void log(Level level, char const* format, Args const&... args); + void log(Level const level, char const* format, Args const&... args); template - void log(Level level, int rank, char const* format, Args const&... args); + void log(Level const level, int const rank, char const* format, Args const&... args); #else template - void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); + void log(Level const level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); template - void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0))); + void log(Level const level, int const rank, char const* format, Args const&... args) + __attribute__((format(printf, 4, 0))); #endif template - void log(Level level, std::string const& format, Args const&... args) + void log(Level const level, std::string const& format, Args const&... args) { return log(level, format.c_str(), args...); } @@ -134,7 +135,7 @@ class Logger }; template -void Logger::log(Logger::Level level, char const* format, Args const&... args) +void Logger::log(Logger::Level const level, char const* format, Args const&... args) { if (isEnabled(level)) { diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py index bd743d829a0..24e12f54634 100644 --- a/cpp/kernels/fmha_v2/fmha_test.py +++ b/cpp/kernels/fmha_v2/fmha_test.py @@ -165,8 +165,8 @@ def test_trtllm_context_mla_attention_fmha(dtype, s): if dtype == "-bf16" and s == 4096: epsilon += ' -epsilon 0.03' - if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version != 120: - pytest.skip("FP8 MLAs are only supported on sm120 currently.") + if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version not in [90, 120]: + pytest.skip("FP8 MLAs are only supported on sm90 and sm120 currently.") # Context phase kernels, always use separate-q-k-v layout. 
subprocess.run( diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 220b7898a98..2d8a6b416a6 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -1914,8 +1914,9 @@ def enable_mutex(kspec): def enable_tma_store(kspec): + output_dtype = kspec.output_dtype if kspec.output_dtype is not None else kspec.dtype # TMA copies data in the 16B granularity. - return 'true' if (kspec.dtype in ['e4m3', 'e4m3_fp32'] + return 'true' if (output_dtype in ['e4m3', 'e4m3_fp32'] and kspec.head_size % 16 == 0) else 'false' @@ -3812,7 +3813,9 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, # use specialized kernels for cases without alibi scales. # there is a numeric issues when applying the exp2f scale optimization and alibi scale at the same time. combinations = product([False, True], \ - [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV, InputLayout.Q_PAGED_KV], [False, True]) + [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV, + InputLayout.Q_PAGED_KV, InputLayout.SEPARATE_Q_K_V], + [False, True]) for (alibi, input_layout, enable_attn_logit_softcapping) in combinations: # alibi and bmm1_tanh_scale shouldn't be used together. if alibi and enable_attn_logit_softcapping: @@ -3911,7 +3914,7 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, has_noloop=0, noloop_step=64, kv_loop_step= - 128, # use 64 kv step size to avoid register spilling + 128, # use 128 kv step size to avoid register spilling kv_tile_buffers=2, # only used by warp specialized kernels unroll_threshold=1, has_scale_max=False, @@ -3926,6 +3929,46 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, sage_block_sizes=sage_block_sizes, output_dtype=output_dtype)) + # context MLA (192x128) + # we could use param 'output_dtype' of enumerate_qgmma_flash_warpspec_kernels(), + # but it will generate many unnecessary kernels and they are not easy to filter out. + for output_type in [None, 'bf16']: + specs.append( + kernel_spec( + sm=sm, + sm_mma=90, + dtype=dtype, + seq_len=0, # support any sequence length + head_size=192, + head_size_v=128, + warps_m=4, #4x1 warpgroups + warps_n=1, + version=2, + interleaved=False, + ldgsts_q= + False, # for Hopper kernels, ldgsts = False signals TMA usage. 
+ ldgsts_k=False, + ldgsts_v=False, + share_smem_k_v=False, + loop_step=64, + q_tile_buffers=1, # only used by warp specialized kernels + has_noloop=0, + noloop_step=64, + kv_loop_step=128, + kv_tile_buffers=2, # only used by warp specialized kernels + unroll_threshold=1, + has_scale_max=False, + flash_attention=True, + warp_specialization=True, + alibi=alibi, + enable_attn_logit_softcapping=enable_attn_logit_softcapping, + return_softmax_stats= + False, # return softmax stats is not supported for fp8 now + scheduling_mode=scheduling_mode, + input_layout=input_layout, + sage_block_sizes=sage_block_sizes, + output_dtype=output_type)) + def enumerate_igmma_kernels(specs, sm=90): specs.append( @@ -6377,7 +6420,7 @@ def enumerate_kernels(): and kspec.tiled == True) # Deepseek MLA (context 192/128 separate-q-k-v) or (kspec.sm in [90, 100, 120] - and kspec.dtype in ['bf16', 'e4m3_fp32'] + and kspec.dtype in ['bf16', 'e4m3', 'e4m3_fp32'] and kspec.head_size == 192 and kspec.head_size_v == 128 and kspec.input_layout == InputLayout.SEPARATE_Q_K_V diff --git a/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h b/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h index 75946bac612..e186ab8ad26 100644 --- a/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h +++ b/cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h @@ -1222,6 +1222,14 @@ struct Gmem_tile_o_qgmma_fp32_16bits inline __device__ Gmem_tile_o_qgmma_fp32_16bits( Params const& params, Block_info const& block_info, Shared&&, int tidx, int cta_row_offset = 0) : params_o_stride_in_bytes_(params.o_stride_in_bytes) + , params_scale_bmm2_( +#ifdef GENERATE_CUBIN + // Specialized for trt-llm generated cubins only. + params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2 +#else + params.scale_bmm2 +#endif + ) , actual_seqlen_(block_info.actual_seqlen) , o_ptr_(reinterpret_cast(params.o_ptr)) { @@ -1251,21 +1259,24 @@ struct Gmem_tile_o_qgmma_fp32_16bits inline __device__ void store(Accumulators const (&acc)[M][N]) { int64_t const step_m = 8 * params_o_stride_in_bytes_; - // we assume M = 1. some shortcuts. - static_assert(M == 1); - -#define STORE_COLUMN(idx) \ - { \ - float _reg0 = acc[0][mma_ni].elt(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + idx); \ - float _reg1 = acc[0][mma_ni].elt(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + idx); \ - static_assert(std::is_same_v || std::is_same_v); \ - uint32_t _out = fmha::float2_to_16bit_2(_reg0, _reg1); \ - int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \ - fmha::stg(o_ptr_ + _offset + 4 * idx, _out); \ - } +#ifdef UNIFIED_EPILOGUE_SCALE + constexpr bool Scale = false; +#else + constexpr bool Scale = true; +#endif #define STORE_COLUMNS() \ { \ - STORE_COLUMN(0) STORE_COLUMN(1) \ + /* we assume M = 1. some shortcuts. */ \ + static_assert(M == 1); \ + uint4 _src = { \ + .x = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2), \ + .y = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2), \ + .z = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1), \ + .w = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1), \ + }; \ + uint2 _dst = Acc_packer::run(this, _src); \ + int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \ + fmha::stg(o_ptr_ + _offset, _dst); \ } #pragma unroll @@ -1303,6 +1314,10 @@ struct Gmem_tile_o_qgmma_fp32_16bits // The stride between rows for the QKV matrice. 
int64_t params_o_stride_in_bytes_; + // Scaling factor; this usually means QKV descale factor in actuality + uint32_t params_scale_bmm2_; + // Scaling factor; this usually means QKV descale factor in actuality + uint32_t params_scale_bmm2_; // The pointer. char* o_ptr_; // The row loaded by this thread. diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h index c8e3c318d6f..1a8853a03ec 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h @@ -755,7 +755,7 @@ struct DMA for (int kgroup_idx = 0; kgroup_idx < Kernel_traits::BMM2_K_GROUPS; kgroup_idx++) { #pragma unroll - for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::D_GROUPS; dgroup_idx++) + for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::DV_GROUPS; dgroup_idx++) { // Src smem block is k first then d uint32_t src_offset = (kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP @@ -764,7 +764,7 @@ struct DMA // Dst smem block is d first then k uint32_t dst_offset = (dgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP - + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D) + + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::DV) * Kernel_traits::ELEMENT_BYTES; transposer.template transpose_(smem_v_src + src_offset, smem_v_dst + dst_offset); diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h index 8c93ce8a988..e12847d7659 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h @@ -589,7 +589,8 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32 // Base class. using Base = Kernel_traits; + SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_, ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_, + OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>; enum { diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index 9a438df9a2a..da44fba60c4 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -1012,7 +1012,7 @@ CUBIN_EXPORT __global__ if (threadIdx.x < smem.gemm1AccColMax.size) { auto const idx = threadIdx.x; - smem.gemm1AccColMax[idx] = mha::numeric_limits::lowest(); + smem.gemm1AccColMax[idx] = safeInitRowMax; smem.gemm1AccColSum[idx] = 0; } smem.gemm1WarpGrpBar.arrive_and_wait(); @@ -1949,7 +1949,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, uint32_t const globalRow = tileStartRow + row; if (globalRow >= cacheSeqLen) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; continue; } if (globalRow >= maskStartRow) @@ -1957,7 +1957,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, uint32_t const maskRow = globalRow - maskStartRow; if ((bit_mask >> maskRow) == 0) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2087,7 +2087,7 @@ __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32 #pragma unroll for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2380,9 +2380,9 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, { uint32_t const col = GmmaAccCoreMat::cols * (4 * n + idxInQuad) + j; assert((col < nbValidCols) == bool(endMask & (1ULL << col))); - if (((mask >> col) & 1) 
== 0) + if ((mask & (1ULL << col)) == 0) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2410,7 +2410,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uin #pragma unroll for (uint32_t i = 0; i < GmmaAccCoreMat::rows; i++) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index bbe2e2fd179..866a0d71c8f 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -285,6 +285,9 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale; xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf; + // Cross attention parameters. + xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths; + return true; } @@ -2229,6 +2232,10 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud { TLLM_CHECK_WITH_INFO(false, "No available kernels are found for FP4 output."); } + else + { + TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase."); + } } // This is the number of kv tokens that q needs to visit, but excluding one as it will be processed before the kv @@ -2750,7 +2757,7 @@ int AttentionOp::initialize() noexcept !useCustomMask() || mEnableContextFMHA, "Only Context FMHA supports custom mask input currently."); } - mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled) && !mCrossAttention + mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled) && (mType == nvinfer1::DataType::kHALF || mType == nvinfer1::DataType::kBF16) && mUseKVCache; if (mEnableXQA) diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp index 2332950629f..0ce601d5b08 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp @@ -1524,6 +1524,11 @@ struct CollectiveMmaArrayMixedInput< CUTLASS_DEVICE void tensormaps_cp_fence_release( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) + { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. 
it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h index fbcd279420d..0c2f3aed72b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h @@ -260,6 +260,8 @@ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_tma_ws_sm90 extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); +extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream); @@ -1969,6 +1971,10 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2 { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 180480, 384, 64, 2, 2, false, true, true, true, false, false, true, false, nullptr}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_softcapping_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, 
"fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90}, +{ DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90}, diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index 97ad58335fc..fcf8ab3851f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -106,6 +106,9 @@ struct XQAParams void* quant_q_buffer_ptr = nullptr; + // for cross attention + int32_t const* encoder_input_lengths = nullptr; + cudaStream_t stream = 0; std::string toString() const @@ -175,6 +178,7 @@ struct XQAParams << "total_num_input_tokens :" << total_num_input_tokens << std ::endl << "is_fp8_output :" << (is_fp8_output ? 
"true" : "false") << std ::endl << "fp8_out_scale :" << fp8_out_scale << std ::endl + << "encoder_input_lengths: " << encoder_input_lengths << std::endl << "stream :" << stream; return ss.str(); diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index 65e55c65fba..4442e2c2369 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -1348,15 +1348,17 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv; - auto const src_v_idx - = static_cast(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv; - - // Only load K,V tokens from encoder qkv input. - auto k = *reinterpret_cast(¶ms.cross_kv_input[src_k_idx]); - auto v = *reinterpret_cast(¶ms.cross_kv_input[src_v_idx]); - - // The kv cache pointers. - auto k_cache_block_ptr - = reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx)); - auto v_cache_block_ptr - = reinterpret_cast(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx)); - // The vector idx in the cache block. - auto block_vec_idx - = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx); - - // Store K and V to the cache. - // INT8/FP8 kv cache. - if constexpr (sizeof(TCache) == 1) - { - // The element index inside the block. - auto block_elt_idx = block_vec_idx * ELTS_PER_VEC; - // Store 8bits kv cache. - mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant); - mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant); - } - else + // Encoder tokens (i.e. KV tokens). + if (head_idx == (kv_head_idx * params.qheads_per_kv_head) && token_idx < encoder_seq_len + && store_encoder_kv_cache && params.kv_cache_buffer.data != nullptr) { - reinterpret_cast(k_cache_block_ptr)[block_vec_idx] = k; - reinterpret_cast(v_cache_block_ptr)[block_vec_idx] = v; + // The global token idx in all sequences. + int global_token_idx = token_idx + encoder_seq_offset; + + // The memory offset. + auto const src_k_idx + = static_cast(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv; + auto const src_v_idx + = static_cast(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv; + + // Only load K,V tokens from encoder qkv input. + auto k = *reinterpret_cast(¶ms.cross_kv_input[src_k_idx]); + auto v = *reinterpret_cast(¶ms.cross_kv_input[src_v_idx]); + + // The kv cache pointers. + auto k_cache_block_ptr + = reinterpret_cast(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx)); + auto v_cache_block_ptr + = reinterpret_cast(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx)); + // The vector idx in the cache block. + auto block_vec_idx + = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx); + + // Store K and V to the cache. + // INT8/FP8 kv cache. + if constexpr (sizeof(TCache) == 1) + { + // The element index inside the block. + auto block_elt_idx = block_vec_idx * ELTS_PER_VEC; + // Store 8bits kv cache. 
+ mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant); + mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant); + } + else + { + reinterpret_cast(k_cache_block_ptr)[block_vec_idx] = k; + reinterpret_cast(v_cache_block_ptr)[block_vec_idx] = v; + } } } } diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp index 34fe1780fe0..956d1919ced 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp @@ -16,7 +16,9 @@ #include "xqaDispatcher.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" +#include namespace { @@ -38,6 +40,87 @@ constexpr inline T roundUp(T a, T b) namespace tensorrt_llm::kernels { +namespace +{ + +template +QKVPreprocessingParams makeQKVPreprocessingParams(XQAParams const& params, + XQALaunchParam const& launchParams, void* xqa_q_input_ptr, Data_type QDataType, + KvCacheDataType cache_type, int32_t batch_beam_size, KVCacheBuffer const& kv_cache_buffer, + int32_t const* cu_seqlens, int32_t const* cu_kv_seqlens, float const* rotary_inv_freq_buf, int multiProcessorCount) +{ + QKVPreprocessingParams preprocessingParms; + memset(&preprocessingParms, 0, sizeof(preprocessingParms)); + // Set parameters. + preprocessingParms.qkv_input = static_cast(const_cast(params.qkv)); + preprocessingParms.q_output = static_cast(xqa_q_input_ptr); + preprocessingParms.kv_cache_buffer = kv_cache_buffer; + preprocessingParms.kv_cache_block_scales_buffer = {}; + preprocessingParms.qkv_bias = static_cast(params.qkv_bias); + // Prepare values for fmha. + preprocessingParms.fmha_bmm1_scale = launchParams.bmm1_scale_ptr; + preprocessingParms.fmha_bmm2_scale = launchParams.bmm2_scale_ptr; + bool const is_fp8_q_input = (QDataType == DATA_TYPE_E4M3); + if (params.kv_cache_quant_mode.hasFp8KvCache()) + { + preprocessingParms.q_scale_quant_orig = params.kv_scale_quant_orig; + preprocessingParms.kv_scale_quant_orig = params.kv_scale_quant_orig; + } + if (params.is_fp8_output) + { + preprocessingParms.o_scale_orig_quant = params.fp8_out_scale; + } + // Buffers. + preprocessingParms.logn_scaling = params.logn_scaling_ptr; + preprocessingParms.seq_lens = params.spec_decoding_generation_lengths; + preprocessingParms.cache_seq_lens = params.sequence_lengths; + preprocessingParms.cu_seq_lens = cu_seqlens; + preprocessingParms.rotary_embedding_inv_freq = rotary_inv_freq_buf; + preprocessingParms.rotary_coef_cache_buffer = params.rotary_cos_sin; + preprocessingParms.kvScaleOrigQuant = params.kv_scale_orig_quant; + preprocessingParms.kv_cache_scale_factors = nullptr; + preprocessingParms.spec_decoding_position_offsets + = params.cross_attention ? nullptr : params.spec_decoding_position_offsets; + preprocessingParms.mrope_position_deltas = params.mrope_position_deltas; + // Scalar parameters. + preprocessingParms.batch_size = int(batch_beam_size); + preprocessingParms.max_input_seq_len = params.generation_input_length; + preprocessingParms.max_kv_seq_len = params.max_past_kv_length; + preprocessingParms.cyclic_kv_cache_len + = params.cross_attention ? params.max_past_kv_length : params.cyclic_attention_window_size; + preprocessingParms.sink_token_len = params.cross_attention ? 
0 : params.sink_token_length; + preprocessingParms.token_num = params.total_num_input_tokens; + preprocessingParms.remove_padding = true; + preprocessingParms.cross_attention = params.cross_attention; + preprocessingParms.head_num = params.num_q_heads; + preprocessingParms.kv_head_num = params.num_kv_heads; + preprocessingParms.qheads_per_kv_head = params.num_q_heads / params.num_kv_heads; + preprocessingParms.size_per_head = params.head_size; + preprocessingParms.fmha_host_bmm1_scale = 1.0f / (sqrtf(params.head_size * 1.0f) * params.q_scaling); + preprocessingParms.rotary_embedding_dim = params.rotary_embedding_dim; + preprocessingParms.rotary_embedding_base = params.rotary_embedding_base; + preprocessingParms.rotary_scale_type = params.rotary_embedding_scale_type; + preprocessingParms.rotary_embedding_scale = params.rotary_embedding_scale; + preprocessingParms.rotary_embedding_max_positions = params.rotary_embedding_max_positions; + preprocessingParms.position_embedding_type = params.position_embedding_type; + preprocessingParms.position_shift_enabled = params.position_shift_enabled; + preprocessingParms.cache_type = cache_type; + preprocessingParms.separate_q_kv_output = true; + preprocessingParms.quantized_fp8_output = is_fp8_q_input; + preprocessingParms.generation_phase = true; + preprocessingParms.multi_processor_count = multiProcessorCount; + preprocessingParms.rotary_vision_start = params.rotary_vision_start; + preprocessingParms.rotary_vision_length = params.rotary_vision_length; + + // Cross-attention only. + + preprocessingParms.encoder_seq_lens = params.encoder_input_lengths; + + return preprocessingParms; +} + +} // namespace + //////////////////////////////////////////////////////////////////////////////////////////////////// XqaDispatcher::XqaDispatcher(XqaFixedParams fixedParams) @@ -137,9 +220,10 @@ bool XqaDispatcher::shouldUse(XQAParams const& params) { SHOULD_NOT_USE("Fallback to MMHA as unidirectional is not supported by TRTLLM-GEN kernels."); } - if (params.cross_attention) + if (params.cross_attention && !params.paged_kv_cache) { - SHOULD_NOT_USE("Fallback to MMHA as cross attention is not supported by TRTLLM-GEN kernels."); + SHOULD_NOT_USE( + "Fallback to MMHA as cross attention without paged KV Cache is not supported by TRTLLM-GEN kernels."); } if (params.paged_kv_cache && params.tokens_per_block < 8) { @@ -252,8 +336,8 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff decoder_params.seqQOffsets = launchParams.cu_seq_lens; decoder_params.seqKVOffsets = launchParams.cu_kv_seq_lens; decoder_params.seqQLengths = params.spec_decoding_generation_lengths; - decoder_params.seqKVLengths = params.sequence_lengths; - decoder_params.batchSize = int(batch_beam_size); + decoder_params.seqKVLengths = params.cross_attention ? params.encoder_input_lengths : params.sequence_lengths; + decoder_params.batchSize = static_cast(batch_beam_size); decoder_params.maxQSeqLength = params.generation_input_length; decoder_params.numTokens = params.total_num_input_tokens; decoder_params.removePadding = true; @@ -273,10 +357,12 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff float const* rotary_inv_freq_buf = params.rotary_embedding_inv_freq_cache; // Use the nullptr for cu_seqlens when it is not computed. 
int const* cu_seqlens{nullptr}; + int const* cu_kv_seqlens{nullptr}; if (decoder_params.isBuildDecoderInfoKernelNeeded()) { rotary_inv_freq_buf = launchParams.rotary_inv_freq_buf; cu_seqlens = launchParams.cu_seq_lens; + cu_kv_seqlens = launchParams.cu_kv_seq_lens; invokeBuildDecoderInfo(decoder_params, params.stream); sync_check_cuda_error(params.stream); } @@ -285,66 +371,10 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff // NOTE: MHA kernels should read kv cache that has already been appended with new tokens' kv cache. void* xqa_q_input_ptr = inputScratch; // The preprocessing kernel that applies RoPE and updates kv cache. - QKVPreprocessingParams preprocessingParms; - memset(&preprocessingParms, 0, sizeof(preprocessingParms)); - // Set parameters. - preprocessingParms.qkv_input = static_cast(const_cast(params.qkv)); - preprocessingParms.q_output = static_cast(xqa_q_input_ptr); - preprocessingParms.kv_cache_buffer = kv_cache_buffer; - preprocessingParms.kv_cache_block_scales_buffer = {}; - preprocessingParms.qkv_bias = static_cast(params.qkv_bias); - // Prepare values for fmha. - preprocessingParms.fmha_bmm1_scale = launchParams.bmm1_scale_ptr; - preprocessingParms.fmha_bmm2_scale = launchParams.bmm2_scale_ptr; - bool const is_fp8_q_input = (mQDataType == DATA_TYPE_E4M3); - if (params.kv_cache_quant_mode.hasFp8KvCache()) - { - preprocessingParms.q_scale_quant_orig = params.kv_scale_quant_orig; - preprocessingParms.kv_scale_quant_orig = params.kv_scale_quant_orig; - } - if (params.is_fp8_output) - { - preprocessingParms.o_scale_orig_quant = params.fp8_out_scale; - } - // Buffers. - preprocessingParms.logn_scaling = params.logn_scaling_ptr; - preprocessingParms.seq_lens = params.spec_decoding_generation_lengths; - preprocessingParms.cache_seq_lens = params.sequence_lengths; - preprocessingParms.cu_seq_lens = cu_seqlens; - preprocessingParms.rotary_embedding_inv_freq = rotary_inv_freq_buf; - preprocessingParms.rotary_coef_cache_buffer = params.rotary_cos_sin; - preprocessingParms.kvScaleOrigQuant = params.kv_scale_orig_quant; - preprocessingParms.kv_cache_scale_factors = nullptr; - preprocessingParms.spec_decoding_position_offsets = params.spec_decoding_position_offsets; - preprocessingParms.mrope_position_deltas = params.mrope_position_deltas; - // Scalar parameters. 
- preprocessingParms.batch_size = int(batch_beam_size); - preprocessingParms.max_input_seq_len = params.generation_input_length; - preprocessingParms.max_kv_seq_len = params.max_past_kv_length; - preprocessingParms.cyclic_kv_cache_len = params.cyclic_attention_window_size; - preprocessingParms.sink_token_len = params.sink_token_length; - preprocessingParms.token_num = params.total_num_input_tokens; - preprocessingParms.remove_padding = true; - preprocessingParms.cross_attention = false; - preprocessingParms.head_num = params.num_q_heads; - preprocessingParms.kv_head_num = params.num_kv_heads; - preprocessingParms.qheads_per_kv_head = params.num_q_heads / params.num_kv_heads; - preprocessingParms.size_per_head = params.head_size; - preprocessingParms.fmha_host_bmm1_scale = 1.0f / (sqrtf(params.head_size * 1.0f) * params.q_scaling); - preprocessingParms.rotary_embedding_dim = params.rotary_embedding_dim; - preprocessingParms.rotary_embedding_base = params.rotary_embedding_base; - preprocessingParms.rotary_scale_type = params.rotary_embedding_scale_type; - preprocessingParms.rotary_embedding_scale = params.rotary_embedding_scale; - preprocessingParms.rotary_embedding_max_positions = params.rotary_embedding_max_positions; - preprocessingParms.position_embedding_type = params.position_embedding_type; - preprocessingParms.position_shift_enabled = params.position_shift_enabled; - preprocessingParms.cache_type = cache_type; - preprocessingParms.separate_q_kv_output = true; - preprocessingParms.quantized_fp8_output = is_fp8_q_input; - preprocessingParms.generation_phase = true; - preprocessingParms.multi_processor_count = mMultiProcessorCount; - preprocessingParms.rotary_vision_start = params.rotary_vision_start; - preprocessingParms.rotary_vision_length = params.rotary_vision_length; + + auto preprocessingParms = makeQKVPreprocessingParams(params, launchParams, xqa_q_input_ptr, + mQDataType, cache_type, batch_beam_size, kv_cache_buffer, cu_seqlens, cu_kv_seqlens, rotary_inv_freq_buf, + mMultiProcessorCount); invokeQKVPreprocessing(preprocessingParms, params.stream); sync_check_cuda_error(params.stream); @@ -394,7 +424,7 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff = reinterpret_cast(launchParams.bmm1_scale_ptr + kIdxScaleSoftmaxLog2Ptr); tllmRunnerParams.oSfScalePtr = params.fp4_out_sf_scale; // The sequence lengths for K/V. - tllmRunnerParams.seqLensKvPtr = params.sequence_lengths; + tllmRunnerParams.seqLensKvPtr = params.cross_attention ? 
params.encoder_input_lengths : params.sequence_lengths; tllmRunnerParams.oPtr = params.output; tllmRunnerParams.oSfPtr = params.output_sf; diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index c73d9a2140b..4745e8e40b1 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -255,7 +255,8 @@ TEST_F(TransferAgentTest, SyncMessage) checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); } while (!checked); auto syncMessage = std::string("agent_sync_message"); - TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage}; + nixlAgent0->notifySyncMessage(agent1, syncMessage); + TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1}; auto status = nixlAgent0->submitTransferRequests(writeReq); auto notif = nixlAgent1->getNotifiedSyncMessages(); @@ -302,7 +303,8 @@ TEST_F(TransferAgentTest, SyncMessage) } while (!checked2); std::string syncMessage4 = "four_agent_sync_message"; - TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0, syncMessage4}; + nixlAgent1->notifySyncMessage(agent0, syncMessage4); + TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0}; auto status1 = nixlAgent1->submitTransferRequests(writeReq1); auto notif4 = nixlAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++) diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh index 18ee554f693..cecd61a7af4 100644 --- a/docker/common/install_nixl.sh +++ b/docker/common/install_nixl.sh @@ -4,8 +4,9 @@ set -ex GITHUB_URL="https://github.com" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" -NIXL_VERSION="0.3.1" +NIXL_VERSION="0.5.0" NIXL_REPO="https://github.com/ai-dynamo/nixl.git" +OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH ARCH_NAME="x86_64-linux-gnu" GDS_PATH="$CUDA_PATH/targets/x86_64-linux" @@ -18,25 +19,26 @@ pip3 install --no-cache-dir meson ninja pybind11 git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO} cd nixl -cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1) -if [[ -z "$cuda_path" ]]; then - echo "libcuda.so.1 not found " +CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1) + +if [[ -z "$CUDA_SO_PATH" ]]; then + echo "libcuda.so.1 not found" exit 1 fi -ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1 +CUDA_SO_PATH=$(dirname $CUDA_SO_PATH) +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_SO_PATH meson setup builddir \ -Ducx_path=$UCX_INSTALL_PATH \ -Dcudapath_lib="$CUDA_PATH/lib64" \ -Dcudapath_inc="$CUDA_PATH/include" \ -Dgds_path="$GDS_PATH" \ - -Dinstall_headers=true \ - -Dstatic_plugins=UCX + -Dinstall_headers=true cd builddir && ninja install cd ../.. 
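The reworked `docker/common/install_nixl.sh` above no longer symlinks a `libcuda.so.1` stub into `$CUDA_PATH/lib64`; it locates the real driver library, appends its directory to `LD_LIBRARY_PATH` only for the duration of the build, and restores the saved value once the install finishes (see the script's remaining lines below). A minimal Python sketch of the same save/extend/restore pattern, with a placeholder standing in for the script's actual meson/ninja build step:

```python
import os
import subprocess

# Locate the real driver library instead of symlinking a stub into $CUDA_PATH/lib64.
result = subprocess.run(["find", "/usr/local", "-name", "libcuda.so.1"],
                        capture_output=True, text=True)
candidates = result.stdout.splitlines()
if not candidates:
    raise SystemExit("libcuda.so.1 not found")

old_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
# Extend the library search path only for the build step.
os.environ["LD_LIBRARY_PATH"] = f"{old_ld_library_path}:{os.path.dirname(candidates[0])}"
try:
    pass  # placeholder for the build (meson setup builddir ... && ninja install)
finally:
    # Restore the original value afterwards, as the script now does.
    os.environ["LD_LIBRARY_PATH"] = old_ld_library_path
```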
rm -rf nixl* # Remove NIXL source tree to save space -rm $CUDA_PATH/lib64/libcuda.so.1 +export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}" diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh index 22f444d9746..ba35e82ce63 100644 --- a/docker/common/install_ucx.sh +++ b/docker/common/install_ucx.sh @@ -2,29 +2,28 @@ set -ex GITHUB_URL="https://github.com" -UCX_VERSION="v1.18.1" +UCX_VERSION="v1.19.x" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" UCX_REPO="https://github.com/openucx/ucx.git" -if [ ! -d ${UCX_INSTALL_PATH} ]; then - git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} - cd ucx - ./autogen.sh - ./contrib/configure-release \ - --prefix=${UCX_INSTALL_PATH} \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=${CUDA_PATH} \ - --with-verbs \ - --with-dm \ - --enable-mt - make install -j$(nproc) - cd .. - rm -rf ucx # Remove UCX source to save space - echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}" -fi +rm -rf ${UCX_INSTALL_PATH} +git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} +cd ucx +./autogen.sh +./contrib/configure-release \ + --prefix=${UCX_INSTALL_PATH} \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=${CUDA_PATH} \ + --with-verbs \ + --with-dm \ + --enable-mt +make install -j$(nproc) +cd .. +rm -rf ucx # Remove UCX source to save space +echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}" diff --git a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md index 8f5c1dfec0f..87432173b42 100644 --- a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md +++ b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md @@ -19,11 +19,11 @@ We have a forthcoming guide for achieving great performance on H100; however, th In this section, we introduce several ways to install TensorRT-LLM. -### NGC Docker Image of dev branch +### NGC Docker Image -Day-0 support for gpt-oss is provided via the NGC container image `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev`. This image was built on top of the pre-day-0 **dev branch**. This container is multi-platform and will run on both x64 and arm64 architectures. +Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use. You can also check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status of the latest releases. 
-Run the following docker command to start the TensorRT-LLM container in interactive mode: +Run the following Docker command to start the TensorRT-LLM container in interactive mode (change the image tag to match latest release): ```bash docker run --rm --ipc=host -it \ @@ -33,7 +33,7 @@ docker run --rm --ipc=host -it \ -p 8000:8000 \ -e TRTLLM_ENABLE_PDL=1 \ -v ~/.cache:/root/.cache:rw \ - nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev \ + nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0 \ /bin/bash ``` @@ -53,9 +53,9 @@ Additionally, the container mounts your user `.cache` directory to save the down Support for gpt-oss has been [merged](https://github.com/NVIDIA/TensorRT-LLM/pull/6645) into the **main branch** of TensorRT-LLM. As we continue to optimize gpt-oss performance, you can build TensorRT-LLM from source to get the latest features and support. Please refer to the [doc](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) if you want to build from source yourself. -### Regular Release of TensorRT-LLM +### TensorRT-LLM Python Wheel Install -Since gpt-oss has been supported on the main branch, you can get TensorRT-LLM out of the box through its regular release in the future. Please check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status. The release is provided as [NGC Container Image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) or [pip Python wheel](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html). +Regular releases of TensorRT-LLM are also provided as [Python wheels](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on the pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html). ## Performance Benchmarking and Model Serving @@ -210,7 +210,10 @@ We can use `trtllm-serve` to serve the model by translating the benchmark comman ```bash trtllm-serve \ - gpt-oss-120b \ # Or ${local_model_path} +Note: You can also point to a local path containing the model weights instead of the HF repo (e.g., `${local_model_path}`). + +trtllm-serve \ + openai/gpt-oss-120b \ --host 0.0.0.0 \ --port 8000 \ --backend pytorch \ @@ -228,7 +231,8 @@ For max-throughput configuration, run: ```bash trtllm-serve \ - gpt-oss-120b \ # Or ${local_model_path} +trtllm-serve \ + openai/gpt-oss-120b \ --host 0.0.0.0 \ --port 8000 \ --backend pytorch \ @@ -262,7 +266,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' "messages": [ { "role": "user", - "content": "What is NVIDIA's advantage for inference?" + "content": "What is NVIDIAs advantage for inference?" } ], "max_tokens": 1024, @@ -348,12 +352,7 @@ others according to your needs. ## (H200/H100 Only) Using OpenAI Triton Kernels for MoE -OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please enable `TRITON` backend with the steps below if you are running on Hopper GPUs. - -### Installing OpenAI Triton - -The `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev` has prepared Triton already (`echo $TRITON_ROOT` could reveal the path). In other situations, you will need to build and install a specific version of Triton. 
Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe). - +OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe) to install and enable the `TRITON` MoE kernels on Hopper GPUs. ### Selecting Triton as the MoE backend diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 61ed4978d96..f43d454ac8a 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1835,6 +1835,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "H100_PCIe-TensorRT-Post-Merge-4": ["h100-cr", "l0_h100", 4, 5], "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5], "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], + "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1], "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1], "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8], diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 751f2516358..bd46241e51d 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -11,7 +11,7 @@ # # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508201630-pre-test diff --git a/requirements.txt b/requirements.txt index e2582f50385..a7821f15db6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cu128 -c constraints.txt -accelerate>=0.25.0 +accelerate>=1.7.0 build colored cuda-python>=12,<13 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index e40543c78f3..d7cd4c61f1a 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -329,10 +329,8 @@ def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str: return str(temp_dir_path) -def check_missing_libs(so_prefix: str) -> list[str]: - result = build_run(f"ldd {so_prefix}.cpython*.so", - capture_output=True, - text=True) +def check_missing_libs(lib_name: str) -> list[str]: + result = build_run(f"ldd {lib_name}", capture_output=True, text=True) missing = [] for line in result.stdout.splitlines(): if "not found" in line: @@ -344,7 +342,7 @@ def check_missing_libs(so_prefix: str) -> list[str]: def generate_python_stubs_linux(binding_type: str, venv_python: Path, - deep_ep: bool): + deep_ep: bool, binding_lib_name: str): is_nanobind = binding_type == "nanobind" if is_nanobind: build_run(f"\"{venv_python}\" -m pip install nanobind") @@ -353,7 +351,7 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path, env_stub_gen = os.environ.copy() cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get( "CUDA_PATH") or "/usr/local/cuda" - missing_libs = check_missing_libs("bindings") + missing_libs = check_missing_libs(binding_lib_name) cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs" if missing_libs and Path(cuda_stub_dir).exists(): @@ -806,7 +804,9 @@ def get_binding_lib(subdirectory, name): ) == 1, f"Exactly one binding library should be present: {binding_lib}" return binding_lib[0] - install_file(get_binding_lib(binding_type, "bindings"), pkg_dir) + binding_lib_dir = get_binding_lib(binding_type, "bindings") + binding_lib_file_name = binding_lib_dir.name + install_file(binding_lib_dir, pkg_dir) with (build_dir / "tensorrt_llm" / "deep_ep" / "cuda_architectures.txt").open() as f: @@ -846,7 +846,7 @@ def get_binding_lib(subdirectory, name): else: # on linux 
generate_python_stubs_linux( binding_type, venv_python, - bool(deep_ep_cuda_architectures)) + bool(deep_ep_cuda_architectures), binding_lib_file_name) if not skip_building_wheel: if dist_dir is None: diff --git a/tensorrt_llm/_torch/attention_backend/__init__.py b/tensorrt_llm/_torch/attention_backend/__init__.py index c426b18f0a5..68805efa4a2 100644 --- a/tensorrt_llm/_torch/attention_backend/__init__.py +++ b/tensorrt_llm/_torch/attention_backend/__init__.py @@ -1,4 +1,4 @@ -from ..custom_ops import IS_FLASHINFER_AVAILABLE +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE from .interface import AttentionBackend, AttentionMetadata from .trtllm import AttentionInputType, TrtllmAttention, TrtllmAttentionMetadata from .vanilla import VanillaAttention, VanillaAttentionMetadata diff --git a/tensorrt_llm/_torch/attention_backend/utils.py b/tensorrt_llm/_torch/attention_backend/utils.py index 911621f150b..b741ec37c73 100644 --- a/tensorrt_llm/_torch/attention_backend/utils.py +++ b/tensorrt_llm/_torch/attention_backend/utils.py @@ -1,7 +1,7 @@ from typing import Optional, Type from ...models.modeling_utils import QuantConfig -from . import IS_FLASHINFER_AVAILABLE +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE from .interface import AttentionBackend, MLAParams, PositionalEmbeddingParams from .trtllm import TrtllmAttention from .vanilla import VanillaAttention diff --git a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py index 00b535dec61..e0e21b1d70e 100644 --- a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py +++ b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py @@ -43,11 +43,13 @@ def _patch_unsupported_input_tensor(): """ original_fn = lowering.unsupported_input_tensor - def patched_fn(t: torch.Tensor, parent=None, node=None): + def patched_fn(t: torch.Tensor, *args, **kwargs): """Bypass meta tensor check.""" if t.is_meta: return False - return original_fn(t, parent, node) + return original_fn( + t, *args, **kwargs + ) # a generic pass-through of the arguments to accommodate torch side change lowering.unsupported_input_tensor = patched_fn try: diff --git a/tensorrt_llm/_torch/custom_ops/__init__.py b/tensorrt_llm/_torch/custom_ops/__init__.py index 8a81d1123a4..f8e46d3d29a 100644 --- a/tensorrt_llm/_torch/custom_ops/__init__.py +++ b/tensorrt_llm/_torch/custom_ops/__init__.py @@ -1,5 +1,6 @@ +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE +from ..modules.attention import attn_custom_op_inplace, mla_custom_op_inplace from .cpp_custom_ops import _register_fake -from .flashinfer_custom_ops import IS_FLASHINFER_AVAILABLE from .torch_custom_ops import bmm_out from .trtllm_gen_custom_ops import fp8_block_scale_moe_runner from .userbuffers_custom_ops import add_to_ub, copy_to_userbuffers, matmul_to_ub @@ -12,6 +13,8 @@ 'add_to_ub', 'copy_to_userbuffers', 'matmul_to_ub', + 'attn_custom_op_inplace', + 'mla_custom_op_inplace', ] if IS_FLASHINFER_AVAILABLE: diff --git a/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py b/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py index 15e4b3750f2..223c9bd5b04 100644 --- a/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/flashinfer_custom_ops.py @@ -1,35 +1,11 @@ -import os -import platform -import traceback - import torch -from ...logger import logger - -IS_FLASHINFER_AVAILABLE = False - - -def get_env_enable_pdl(): - return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1" - - -ENABLE_PDL = 
get_env_enable_pdl() -if ENABLE_PDL: - logger.info("PDL is enabled") - -if platform.system() != "Windows": - try: - import flashinfer - IS_FLASHINFER_AVAILABLE = True - except ImportError: - traceback.print_exc() - print( - "flashinfer is not installed properly, please try pip install or building from source codes" - ) +from ..flashinfer_utils import ENABLE_PDL, IS_FLASHINFER_AVAILABLE if IS_FLASHINFER_AVAILABLE: from flashinfer.activation import silu_and_mul from flashinfer.norm import fused_add_rmsnorm, rmsnorm + from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace # Warp this into custom op since flashinfer didn't warp it properly and we want to avoid graph break between mlp layer for user buffer optimization @torch.library.custom_op("trtllm::flashinfer_silu_and_mul", mutates_args=()) @@ -69,7 +45,7 @@ def flashinfer_apply_rope_with_cos_sin_cache_inplace( cos_sin_cache: torch.Tensor, is_neox: bool = True, ) -> None: - flashinfer.apply_rope_with_cos_sin_cache_inplace( + apply_rope_with_cos_sin_cache_inplace( positions, query, key, diff --git a/tensorrt_llm/_torch/flashinfer_utils.py b/tensorrt_llm/_torch/flashinfer_utils.py new file mode 100644 index 00000000000..5b150665b5b --- /dev/null +++ b/tensorrt_llm/_torch/flashinfer_utils.py @@ -0,0 +1,27 @@ +import os +import platform +import traceback + +from ..logger import logger + +IS_FLASHINFER_AVAILABLE = False + + +def get_env_enable_pdl(): + return os.environ.get("TRTLLM_ENABLE_PDL", "0") == "1" + + +ENABLE_PDL = get_env_enable_pdl() +if ENABLE_PDL: + logger.info("PDL is enabled") + +if platform.system() != "Windows": + try: + import flashinfer + logger.info(f"flashinfer is available: {flashinfer.__version__}") + IS_FLASHINFER_AVAILABLE = True + except ImportError: + traceback.print_exc() + print( + "flashinfer is not installed properly, please try pip install or building from source codes" + ) diff --git a/tensorrt_llm/_torch/modules/rms_norm.py b/tensorrt_llm/_torch/modules/rms_norm.py index 39787b82b7a..2a22d858250 100644 --- a/tensorrt_llm/_torch/modules/rms_norm.py +++ b/tensorrt_llm/_torch/modules/rms_norm.py @@ -19,7 +19,7 @@ import torch from torch import nn -from ..custom_ops import IS_FLASHINFER_AVAILABLE +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE class RMSNorm(nn.Module): diff --git a/tensorrt_llm/_torch/modules/rotary_embedding.py b/tensorrt_llm/_torch/modules/rotary_embedding.py index 0b0d5582687..35bf7759d2e 100644 --- a/tensorrt_llm/_torch/modules/rotary_embedding.py +++ b/tensorrt_llm/_torch/modules/rotary_embedding.py @@ -3,8 +3,8 @@ import torch from torch import nn -from ..attention_backend import IS_FLASHINFER_AVAILABLE from ..attention_backend.interface import RopeParams +from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE class RotaryEmbedding(nn.Module): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 4cebfae58b1..a40b9b90459 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1181,6 +1181,17 @@ def _forward_step_inter_pp(self, scheduled_batch) -> SampleState: def _validate_request(self, request: LlmRequest): if isinstance(self.model_engine.model, DecoderModelForCausalLM): + # Only skip token‐range checks for Llama4 when the request has multimodal data + from ..models.modeling_llama import Llama4ForConditionalGeneration + if isinstance(self.model_engine.model, + Llama4ForConditionalGeneration): + has_mm = bool(request.py_multimodal_data) + if has_mm: + 
logger.debug( + f"Skipping token-range validation for {type(self.model_engine.model).__name__} " + "(multimodal request)") + return + # FIXME: This check is necessary because of how Qwen2ForProcessRewardModel # subclasses DecoderModelForCausalLM. Perhaps the functionality # of DecoderModelForCausalLM reused by Qwen2ForProcessRewardModel diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index eff80d1a69d..32f4a420864 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -20,8 +20,8 @@ from .. import LLM as PyTorchLLM from .._tensorrt_engine import LLM -from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended, - GPQAMain, JsonModeEval) +from ..evaluate import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond, + GPQAExtended, GPQAMain, JsonModeEval) from ..llmapi import BuildConfig, KvCacheConfig from ..llmapi.llm_utils import update_llm_args_with_extra_options from ..logger import logger, severity_map @@ -152,6 +152,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, main.add_command(GPQAMain.command) main.add_command(GPQAExtended.command) main.add_command(JsonModeEval.command) +main.add_command(MMMU.command) if __name__ == "__main__": main() diff --git a/tensorrt_llm/evaluate/__init__.py b/tensorrt_llm/evaluate/__init__.py index 58b789ecbef..89d87425338 100755 --- a/tensorrt_llm/evaluate/__init__.py +++ b/tensorrt_llm/evaluate/__init__.py @@ -15,10 +15,10 @@ from .cnn_dailymail import CnnDailymail from .json_mode_eval import JsonModeEval -from .lm_eval import GSM8K, GPQADiamond, GPQAExtended, GPQAMain +from .lm_eval import GSM8K, MMMU, GPQADiamond, GPQAExtended, GPQAMain from .mmlu import MMLU __all__ = [ "CnnDailymail", "MMLU", "GSM8K", "GPQADiamond", "GPQAMain", "GPQAExtended", - "JsonModeEval" + "JsonModeEval", "MMMU" ] diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py index 920299b1030..c77e3be4479 100644 --- a/tensorrt_llm/evaluate/lm_eval.py +++ b/tensorrt_llm/evaluate/lm_eval.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import json import os from contextlib import contextmanager from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -22,6 +23,7 @@ from tqdm import tqdm import tensorrt_llm.profiler as profiler +from tensorrt_llm.inputs import prompt_inputs try: from lm_eval.api.model import TemplateLM @@ -31,11 +33,18 @@ from .. import LLM as PyTorchLLM from .._tensorrt_engine import LLM +from ..inputs import (ConversationMessage, MultimodalDataTracker, + add_multimodal_placeholders, convert_image_mode) +from ..inputs.utils import apply_chat_template as trtllm_apply_chat_template from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams from .interface import Evaluator +# NOTE: lm_eval uses "" as the default image placeholder +# https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25 +LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER = "" + class LmEvalWrapper(TemplateLM): @@ -125,6 +134,163 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: return [output.outputs[0].text for output in outputs] +class MultimodalLmEvalWrapper(LmEvalWrapper): + """ + Multimodal wrapper for lm-evaluation-harness that handles vision-language models. 
+ + This wrapper extends the base LmEvalWrapper to support multimodal inputs, + particularly for tasks that require both text and image processing. + """ + + def __init__(self, + llm: Union[LLM, PyTorchLLM], + sampling_params: Optional[SamplingParams] = None, + streaming: bool = False, + max_images: int = 999): + """ + Initialize the multimodal wrapper. + + Args: + llm: The language model instance (either TensorRT or PyTorch) + sampling_params: Parameters for text generation + streaming: Whether to use streaming generation + max_images: Maximum number of images per prompt (currently unlimited in TRT-LLM), set to 999 from lm_eval's default value. + """ + super().__init__(llm, sampling_params, streaming) + + # NOTE: Required by lm_eval to identify this as a multimodal model + self.MULTIMODAL = True + self.max_images = max_images + self.model_type = self._get_model_type(llm) + + # NOTE: In TRT-LLM, currently we do not support interleaved text and image. Instead, we are adding image placeholders at the end of the text or at the beginning of the text. + # So, until we support interleaved text and image, we set this to False. + self.interleave = False + + def _get_model_type(self, llm: Union[LLM, PyTorchLLM]) -> str: + """Extract model type from the model configuration.""" + config_path = os.path.join(llm._hf_model_dir, 'config.json') + + if not os.path.exists(config_path): + raise FileNotFoundError( + f"Model configuration file not found: {config_path}") + + try: + with open(config_path, 'r') as f: + config = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in model configuration file {config_path}: {e}") + + if 'model_type' not in config: + raise KeyError( + f"'model_type' key not found in model configuration: {config_path}" + ) + + return config['model_type'] + + def apply_chat_template(self, + chat_history: List[Dict[str, str]], + add_generation_prompt: bool = True) -> str: + """ + Apply chat template to multimodal conversation history. + + Converts text with image placeholders into structured format expected by + the multimodal processor. + + Adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L225 + """ + mm_placeholder_counts = [] + for i in range(len(chat_history)): + content = chat_history[i] + text = content["content"] + image_count = min(self.max_images, + text.count(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER)) + + if self.interleave: + # TODO: Implement interleaved text and image. + text.split(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER) + ... + else: + text = text.replace(LM_EVAL_DEFAULT_IMAGE_PLACEHOLDER, "") + + conv = ConversationMessage(role="user", content=text) + mm_data_tracker = MultimodalDataTracker(self.model_type) + + # NOTE: Since we already have loaded images, for the placeholder purpose, we add data here. + for _ in range(image_count): + mm_data_tracker.add_data("image", None) + mm_placeholder_count = mm_data_tracker.placeholder_counts() + if mm_placeholder_count: + # TODO: This is an assumption of not interleaving text and image. Need to extend to interleaved texts. 
+ conv["content"] = add_multimodal_placeholders( + self.model_type, conv["content"], mm_placeholder_count) + mm_placeholder_counts.append(mm_placeholder_count) + chat_history[i] = conv + + output = trtllm_apply_chat_template( + model_type=self.model_type, + tokenizer=self.llm.tokenizer, + processor=self.llm.input_processor.processor, + conversation=chat_history, + add_generation_prompt=add_generation_prompt, + mm_placeholder_counts=mm_placeholder_counts, + tools=None, + chat_template_kwargs={ + "continue_final_message": not add_generation_prompt + }) + return output + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + """ + Generate text responses for multimodal requests. + + This method processes multimodal requests that include both text prompts + and visual data (images). + + Args: + requests: List of multimodal generation requests + disable_tqdm: Whether to disable progress bars + + Returns: + List of generated text responses + """ + profiler.start("trtllm exec") + results = [] + for request in tqdm(requests, + desc="Submitting requests", + disable=disable_tqdm): + + # NOTE: For now, only this part is different from the original generate_until + prompt, gen_kwargs, media_data = request.args + prompt = prompt_inputs(prompt) + + # NOTE: Convert RGBA format to RGB format + images = [ + convert_image_mode(img, "RGB") for img in media_data["visual"] + ] + prompt["multi_modal_data"] = {"image": images} + + sampling_params = self._get_sampling_params(gen_kwargs) + output = self.llm.generate_async(prompt, + sampling_params=sampling_params, + streaming=self.streaming) + results.append(output) + + outputs = [] + for output in tqdm(results, + desc="Fetching responses", + disable=disable_tqdm): + outputs.append(output.result()) + + profiler.stop("trtllm exec") + elapsed_time = profiler.elapsed_time_in_sec("trtllm exec") + logger.info(f"TRTLLM execution time: {elapsed_time:.3f} seconds.") + profiler.reset("trtllm exec") + + return [output.outputs[0].text for output in outputs] + + class LmEvalEvaluator(Evaluator): def __init__(self, @@ -134,7 +300,8 @@ def __init__(self, random_seed: int = 0, apply_chat_template: bool = False, fewshot_as_multiturn: bool = False, - system_prompt: Optional[str] = None): + system_prompt: Optional[str] = None, + is_multimodal: bool = False): try: import lm_eval except ImportError as e: @@ -143,6 +310,12 @@ def __init__(self, "Please install the package first, e.g., `pip install lm_eval`." ) from e import lm_eval.tasks + self.MULTIMODAL = is_multimodal + if self.MULTIMODAL: + apply_chat_template = True + logger.info( + "Chat template automatically enabled for multimodal evaluation." 
+ ) super().__init__(random_seed=random_seed, apply_chat_template=apply_chat_template, fewshot_as_multiturn=fewshot_as_multiturn, @@ -156,12 +329,31 @@ def __init__(self, with self._patch_lm_eval(): self.task_dict = lm_eval.tasks.get_task_dict( task_name, task_manager=task_manager) - # Few-shot random seed - self.task_dict[self.task_name].set_fewshot_seed(random_seed) - # Shuffle dataset - data = self.task_dict[self.task_name].dataset - for split in data.keys(): - data[split] = data[split].shuffle(random_seed) + + # Adopted from https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/evaluator.py#L290 + def _adjust_config(task_dict, random_seed): + adjusted_task_dict = {} + for task_name, task_obj in task_dict.items(): + if isinstance(task_obj, dict): + adjusted_task_dict = { + **adjusted_task_dict, + **{ + task_name: _adjust_config(task_obj, random_seed) + }, + } + else: + # NOTE: Few-shot random seed + task_obj.set_fewshot_seed(seed=random_seed) + adjusted_task_dict[task_name] = task_obj + + # NOTE: Shuffle dataset + data = adjusted_task_dict[task_name].dataset + for split in data.keys(): + data[split] = data[split].shuffle(random_seed) + + return adjusted_task_dict + + self.task_dict = _adjust_config(self.task_dict, random_seed) @contextmanager def _patch_lm_eval(self): @@ -196,8 +388,9 @@ def evaluate(self, streaming: bool = False, scores_filter: str = None) -> float: import lm_eval + lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper results = lm_eval.evaluate( - lm=LmEvalWrapper(llm, sampling_params, streaming), + lm=lm_cls(llm, sampling_params, streaming), task_dict=self.task_dict, limit=self.num_samples, apply_chat_template=self.apply_chat_template, @@ -226,6 +419,7 @@ def evaluate(self, @classmethod def command_harness(cls, ctx, **kwargs): llm: Union[LLM, PyTorchLLM] = ctx.obj + evaluator = cls(dataset_path=kwargs.pop("dataset_path", None), num_samples=kwargs.pop("num_samples", None), random_seed=kwargs.pop("random_seed", 0), @@ -233,10 +427,12 @@ def command_harness(cls, ctx, **kwargs): False), fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn", False), - system_prompt=kwargs.pop("system_prompt", None)) + system_prompt=kwargs.pop("system_prompt", None), + is_multimodal=kwargs.pop("is_multimodal", False)) sampling_params = SamplingParams( max_tokens=kwargs.pop("max_output_length"), - truncate_prompt_tokens=kwargs.pop("max_input_length")) + truncate_prompt_tokens=kwargs.pop("max_input_length"), + stop=kwargs.pop("stop", None)) evaluator.evaluate(llm, sampling_params) llm.shutdown() @@ -419,3 +615,52 @@ def __init__(self, **kwargs): @staticmethod def command(ctx, **kwargs) -> None: GPQAExtended.command_harness(ctx, **kwargs) + + +class MMMU(LmEvalEvaluator): + + def __init__(self, **kwargs): + super().__init__("mmmu_val", **kwargs) + + @click.command("mmmu") + @click.option("--dataset_path", + type=str, + default=None, + help="The path to MMMU dataset. " + "If unspecified, the dataset is downloaded from HF hub.") + @click.option( + "--num_samples", + type=int, + default=None, + help="Number of samples to run the evaluation; None means full dataset." + ) + @click.option("--random_seed", + type=int, + default=0, + help="Random seed for dataset processing.") + @click.option( + "--system_prompt", + type=str, + default=None, + help= + "The system prompt to be added on the prompt. If specified, it will add {'role': 'system', 'content': system_prompt} to the prompt." 
+ ) + @click.option("--max_input_length", + type=int, + default=8192, + help="Maximum prompt length.") + @click.option( + "--max_output_length", + type=int, + default= + 512, # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13 + help="Maximum generation length.") + @click.pass_context + @staticmethod + def command(ctx, **kwargs) -> None: + # NOTE: MMMU is a multimodal task, so we need to set the is_multimodal and apply_chat_template flags to True + kwargs["is_multimodal"] = True + kwargs["apply_chat_template"] = True + kwargs[ + "stop"] = "<|endoftext|>" # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L10 + MMMU.command_harness(ctx, **kwargs) diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py index f7e5ce97d7f..070b8449cee 100644 --- a/tensorrt_llm/inputs/__init__.py +++ b/tensorrt_llm/inputs/__init__.py @@ -8,9 +8,9 @@ from .utils import (ALL_SUPPORTED_AUDIO_MODELS, ALL_SUPPORTED_IMAGE_MODELS, ALL_SUPPORTED_MULTIMODAL_MODELS, ALL_SUPPORTED_VIDEO_MODELS, ConversationMessage, MultimodalData, MultimodalDataTracker, - add_multimodal_placeholders, async_load_audio, - async_load_image, async_load_video, - default_multimodal_input_loader, + add_multimodal_placeholders, apply_chat_template, + async_load_audio, async_load_image, async_load_video, + convert_image_mode, default_multimodal_input_loader, encode_base64_content_from_url, load_image, load_video) __all__ = [ @@ -37,6 +37,8 @@ "async_load_image", "async_load_video", "add_multimodal_placeholders", + "apply_chat_template", + "convert_image_mode", "default_multimodal_input_loader", "encode_base64_content_from_url", "load_image", diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index 3b856a2bfbd..a5e0c0a5a42 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -72,11 +72,14 @@ def load_base64_image(parsed_url: str) -> Image.Image: return image -def load_image(image: str, +def load_image(image: Union[str, Image.Image], format: str = "pt", device: str = "cpu") -> Union[Image.Image, torch.Tensor]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" + if isinstance(image, Image.Image): + return image.convert('RGB') + parsed_url = urlparse(image) if parsed_url.scheme in ["http", "https"]: @@ -94,11 +97,14 @@ def load_image(image: str, async def async_load_image( - image: str, + image: Union[str, Image.Image], format: str = "pt", device: str = "cpu") -> Union[Image.Image, torch.Tensor]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" + if isinstance(image, Image.Image): + return image.convert('RGB') + parsed_url = urlparse(image) if parsed_url.scheme in ["http", "https"]: @@ -386,12 +392,13 @@ def resolve_hf_chat_template( def handle_placeholder_exceptions(model_type: str, conversation: list[ConversationMessage], - mm_placeholder_counts: dict[str, int]): + mm_placeholder_counts: list[dict[str, int]]): if model_type == "llava_next": # we need to convert the flattened content back to conversation format - for conv in conversation: + for conv, mm_placeholder_count in zip(conversation, + mm_placeholder_counts): conv["content"] = [{"type": "text", "text": conv["content"]}, \ - *[{"type": "image"} for _ in mm_placeholder_counts]] + *[{"type": "image"} for _ in range(mm_placeholder_count[''])]] else: raise ValueError(f"This path should not be reached for: {model_type}") return conversation @@ -572,7 +579,10 @@ def 
convert_to_conversation_message( # Check if mdata is a MultimodalData if isinstance(mdata, dict) and "modality" in mdata and "data" in mdata: - mm_data_tracker.add_data(mdata["modality"], mdata["data"]) + modality = mdata["modality"] + if modality == "multiple_image": + modality = "image" + mm_data_tracker.add_data(modality, mdata["data"]) else: # Add embeddings to the tracker for placeholder handling mm_data_tracker.add_data(mdata["modality"], diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py index ec67d469bc0..687799fb10a 100644 --- a/tensorrt_llm/serve/chat_utils.py +++ b/tensorrt_llm/serve/chat_utils.py @@ -179,6 +179,7 @@ def parse_chat_messages_coroutines( Any, Any, Optional[Dict[str, List[Any]]]]]]: """Parse multiple chat messages and return conversation and coroutine.""" conversation = [] + mm_placeholder_counts = [] mm_data_tracker = MultimodalDataTracker(model_config.model_type) for msg in messages: @@ -187,11 +188,12 @@ def parse_chat_messages_coroutines( if parsed_msg["media"]: for mdata in parsed_msg["media"]: mm_data_tracker.add_data(mdata["modality"], mdata["data"]) - mm_placeholder_counts = mm_data_tracker.placeholder_counts() - if mm_placeholder_counts: - parsed_msg["content"] = add_multimodal_placeholders( - model_config.model_type, parsed_msg["content"], - mm_placeholder_counts) + mm_placeholder_count = mm_data_tracker.placeholder_counts() + if mm_placeholder_count: + parsed_msg["content"] = add_multimodal_placeholders( + model_config.model_type, parsed_msg["content"], + mm_placeholder_count) + mm_placeholder_counts.append(mm_placeholder_count) return conversation, mm_data_tracker.retrieve_all_async( ), mm_placeholder_counts diff --git a/tests/integration/defs/.test_durations b/tests/integration/defs/.test_durations index 23a7d075d94..c85dc65bfe7 100644 --- a/tests/integration/defs/.test_durations +++ b/tests/integration/defs/.test_durations @@ -138,7 +138,16 @@ "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8]": 67.32832619687542, "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-False-TinyLlama-1.1B-Chat-v1.0]": 46.302398771978915, "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-True-TinyLlama-1.1B-Chat-v1.0]": 38.81214914191514, - "test_unittests.py::test_unittests_v2[unittest/_torch -k \"not (modeling or multi_gpu or auto_deploy)\"]": 1186.6702785710804, + "test_unittests.py::test_unittests_v2[unittest/_torch/attention]": 588.56, + "test_unittests.py::test_unittests_v2[unittest/_torch/compilation]": 31.94, + "test_unittests.py::test_unittests_v2[unittest/_torch/debugger]": 36.69, + "test_unittests.py::test_unittests_v2[unittest/_torch/executor]": 170.86, + "test_unittests.py::test_unittests_v2[unittest/_torch/misc]": 600.50, + "test_unittests.py::test_unittests_v2[unittest/_torch/modules]": 158.50, + "test_unittests.py::test_unittests_v2[unittest/_torch/multimodal]": 23.54, + "test_unittests.py::test_unittests_v2[unittest/_torch/sampler]": 107.66, + "test_unittests.py::test_unittests_v2[unittest/_torch/speculative]": 1850.16, + "test_unittests.py::test_unittests_v2[unittest/_torch/thop]": 852.56, "test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_mixtral\"]": 208.1838396479725, "test_unittests.py::test_unittests_v2[unittest/_torch/multi_gpu_modeling -k \"deepseek\"]": 393.0210295501165, "cpp/test_e2e.py::test_model[-gpt_executor-80]": 4016.7569622844458, @@ -238,7 
+247,7 @@ "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[False-False-TinyLlama-1.1B-Chat-v1.0]": 48.16434509307146, "test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]": 163.86223009089008, "test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]": 115.74023819994181, - "test_unittests.py::test_unittests_v2[unittest/_torch -k \"modeling_llama\"]": 718.749935634085, + "test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_llama\"]": 718.749935634085, "accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache": 399.65961667895317, "accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache": 392.90223736315966, "accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized": 604.7383968606591, @@ -280,7 +289,7 @@ "disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]": 67.3897166326642, "disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]": 98.97588296607137, "disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]": 67.9668476767838, - "test_unittests.py::test_unittests_v2[unittest/_torch/test_attention_mla.py]": 26.32902159006335, + "test_unittests.py::test_unittests_v2[unittest/_torch/attention/test_attention_mla.py]": 26.32902159006335, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 591.2785023800097, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 306.84709841990843, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 220.57452515885234, @@ -292,7 +301,6 @@ "test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]": 109.26379436196294, "test_e2e.py::test_ptp_quickstart_advanced_mixed_precision": 80.88908524392173, "test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]": 99.42739840806462, - "test_unittests.py::test_unittests_v2[unittest/_torch/speculative/test_eagle3.py]": 317.8708840459585, "accuracy/test_cli_flow.py::TestLlama7B::test_auto_dtype": 402.75543826818466, "examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float32-BertModel-bert/bert-base-uncased]": 111.17977902293205, "examples/test_mamba.py::test_llm_mamba_1gpu[mamba-130m-float16-enable_gemm_plugin]": 112.04011878371239, diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py index 35234e42ef6..06303886292 100644 --- a/tests/integration/defs/accuracy/accuracy_core.py +++ b/tests/integration/defs/accuracy/accuracy_core.py @@ -342,6 +342,26 @@ class JsonModeEval(AccuracyTask): apply_chat_template=True) +class MMMU(AccuracyTask): + DATASET = "mmmu" + DATASET_DIR = f"{llm_models_root()}/datasets/MMMU" + + ALPHA = 0.05 + BETA = 0.2 + SIGMA = 50 + NUM_SAMPLES = 900 + + MAX_BATCH_SIZE = 128 + MAX_INPUT_LEN = 8192 + 
MAX_OUTPUT_LEN = 512 + + EVALUATOR_CLS = tensorrt_llm.evaluate.MMMU + EVALUATOR_KWARGS = dict(dataset_path=DATASET_DIR, + random_seed=0, + is_multimodal=True, + apply_chat_template=True) + + class PassKeyRetrieval64k(AccuracyTask): DATASET = "passkey_retrieval_64k" LEVEL = 3 diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml new file mode 100644 index 00000000000..6edc728210b --- /dev/null +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -0,0 +1,2 @@ +Qwen/Qwen2-VL-7B-Instruct: + - accuracy: 48.44 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index f0a8e923289..8879904627e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -31,7 +31,7 @@ parametrize_with_ids, skip_no_hopper, skip_post_blackwell, skip_pre_ada, skip_pre_blackwell, skip_pre_hopper) -from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond, +from .accuracy_core import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond, JsonModeEval, LlmapiAccuracyTestHarness) @@ -1734,6 +1734,7 @@ class TestKimiK2(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/Kimi-K2-Instruct" @pytest.mark.skip_less_mpi_world_size(8) + @skip_post_blackwell @skip_pre_hopper @pytest.mark.parametrize( "tp_size,pp_size,ep_size,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size", @@ -2284,7 +2285,10 @@ def test_nvfp4( pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, **pytorch_config, - enable_attention_dp=attention_dp) as llm: + enable_attention_dp=attention_dp, + max_batch_size=32) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -2442,11 +2446,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, [ (8, 1, 8, True, True, True, "CUTLASS", False), (8, 1, 8, True, True, True, "TRTLLM", False), - (8, 1, 8, False, False, False, "TRTLLM", True), + (8, 1, 8, True, True, True, "TRTLLM", True), ], ids=[ - "latency_moe_cutlass", "latency_moe_trtllm", - "latency_moe_trtllm_eagle3" + "latency_moe_cutlass", + "latency_moe_trtllm", + "latency_moe_trtllm_eagle3", ], ) def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, @@ -2481,6 +2486,50 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_blackwell + @pytest.mark.skip_less_mpi_world_size(4) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3", + [ + (4, 1, 4, False, False, False, "TRTLLM", + True), # TP8 has bug when we use TRTLLM moe backend and eagle3 + ], + ids=[ + "latency_moe_trtllm_eagle3", + ], + ) + def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, + cuda_graph, overlap_scheduler, moe_backend, eagle3): + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + enable_block_reuse=not eagle3) + spec_config = None + if eagle3: + spec_config = EagleDecodingConfig( + max_draft_len=2, + speculative_model_dir= + f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", + eagle3_one_model=True) + with LLM( + f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", + 
tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-4-mini-instruct" @@ -2690,3 +2739,22 @@ def test_auto_dtype(self): task.evaluate(llm) task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + + +class TestQwen2_VL_7B(LlmapiAccuracyTestHarness): + MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct" + MODEL_PATH = f"{llm_models_root()}/Qwen2-VL-7B-Instruct" + + # NOTE: MMMU adds <|endoftext|> to the stop token. + sampling_params = SamplingParams(max_tokens=MMMU.MAX_OUTPUT_LEN, + truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, + stop="<|endoftext|>") + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + def test_auto_dtype(self): + with LLM(self.MODEL_PATH, + max_num_tokens=16384, + kv_cache_config=self.kv_cache_config) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index c427dea2bc7..e28f1bcecd4 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 @@ -589,6 +589,7 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype @@ -630,7 +631,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-S test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False] 
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index c977a77d3c2..51c452cbc78 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] @@ -164,7 +164,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml index 5ec16996e7c..06ccf6fb2ad 100644 --- a/tests/integration/test_lists/test-db/l0_a30.yml +++ b/tests/integration/test_lists/test-db/l0_a30.yml @@ -19,7 +19,7 @@ l0_a30: - unittest/_torch/modeling -k "modeling_qwen" - unittest/_torch/modeling -k "modeling_qwen_moe" - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - - unittest/_torch/test_beam_search.py + - unittest/_torch/sampler/test_beam_search.py - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index cb36129a147..ae0d0bd0411 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -20,22 +20,10 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True] @@ -66,15 +54,21 @@ l0_b200: - test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] - test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False] - - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (120) - - unittest/_torch -k "modeling_llama" + - unittest/_torch/attention + - unittest/_torch/compilation + - unittest/_torch/debugger + - unittest/_torch/executor + - unittest/_torch/misc + - unittest/_torch/modules + - unittest/_torch/multimodal + - unittest/_torch/sampler + - unittest/_torch/speculative + - unittest/_torch/thop + - unittest/_torch/modeling -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k "modeling_deepseek" - unittest/_torch/modeling -k "modeling_gpt_oss" - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - - unittest/_torch/speculative/test_eagle3.py - - unittest/_torch/speculative/test_kv_cache_reuse.py - - unittest/_torch/speculative/test_dynamic_spec_decode.py - condition: ranges: system_gpu_count: @@ -100,7 +94,6 @@ l0_b200: - unittest/trt/attention/test_gpt_attention.py -k "trtllm_gen" - unittest/llmapi/test_llm_quant.py # 3.5 mins on B200 - unittest/trt/functional/test_fp4_gemm.py # 3 mins on B200 - - condition: ranges: system_gpu_count: @@ -117,3 +110,29 @@ l0_b200: - triton_server/test_triton.py::test_llava[llava] - triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] - triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] +- condition: + ranges: + system_gpu_count: + gte: 1 + lte: 1 + wildcards: + gpu: + - '*b100*' + linux_distribution_name: ubuntu* + terms: + stage: post_merge + backend: pytorch + tests: + # ------------- PyTorch tests --------------- + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index ac39fbdc88c..7d1cc92fef5 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -69,3 +69,4 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index d4d2745e3f6..857319c44c2 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -17,6 +17,5 @@ l0_gb200_multi_nodes: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180) + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb202.yml b/tests/integration/test_lists/test-db/l0_gb202.yml index 3026d6e449d..4d2638e95f5 100644 --- a/tests/integration/test_lists/test-db/l0_gb202.yml +++ b/tests/integration/test_lists/test-db/l0_gb202.yml @@ -20,7 +20,7 @@ l0_gb202: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0] - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
   # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
-  - unittest/_torch/test_attention_mla.py
+  - unittest/_torch/attention/test_attention_mla.py
   - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
   - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index a52b515e644..bc840823171 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -14,14 +14,23 @@ l0_h100:
       backend: pytorch
   tests:
   # ------------- PyTorch tests ---------------
+  - unittest/_torch/attention
+  - unittest/_torch/compilation
+  - unittest/_torch/debugger
+  - unittest/_torch/executor
+  - unittest/_torch/misc
+  - unittest/_torch/modules
+  - unittest/_torch/multimodal
+  - unittest/_torch/sampler
+  - unittest/_torch/speculative
+  - unittest/_torch/thop
   # Only key models in H100: llama/mixtral/nemotron/deepseek
-  - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
-  - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (90)
-  - unittest/_torch -k "modeling_llama"
+  - unittest/_torch/modeling -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_nemotron"
   - unittest/_torch/modeling -k "modeling_gemma3"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
+  - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
   - unittest/disaggregated/test_disagg_utils.py
   - unittest/disaggregated/test_router.py
   - unittest/disaggregated/test_remoteDictionary.py
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index bfb7ed58ea8..622080941b2 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -37,6 +37,7 @@ l0_l40s:
   - test_e2e.py::test_openai_chat_with_logit_bias[torch_sampler]
   - test_e2e.py::test_openai_completions_with_logit_bias[trtllm_sampler]
   - test_e2e.py::test_openai_chat_with_logit_bias[trtllm_sampler]
+  - accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
index 32a03fd591c..49c29329dc7 100644
--- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
+++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -17,7 +17,7 @@ l0_rtx_pro_6000:
   - unittest/_torch/modeling -k "modeling_mllama"
   - unittest/_torch/modeling -k "modeling_out_of_tree"
   # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
-  - unittest/_torch/test_attention_mla.py
+  - unittest/_torch/attention/test_attention_mla.py
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
   - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index aed69261062..86af57819b3 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
@@ -287,13 +286,9 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5403818)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5403818)
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5453992)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5454898)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5454898)
+accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype SKIP (https://nvbugs/5454875)
 examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5434372)
 triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5457489)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
 accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
 triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
@@ -319,3 +314,8 @@ disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlam
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5465642)
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5464461)
+disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5448449)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815)
+full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5467815)
diff --git a/tests/unittest/_torch/test_attention.py b/tests/unittest/_torch/attention/test_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_attention.py
rename to tests/unittest/_torch/attention/test_attention.py
diff --git a/tests/unittest/_torch/test_attention_mla.py b/tests/unittest/_torch/attention/test_attention_mla.py
similarity index 100%
rename from tests/unittest/_torch/test_attention_mla.py
rename to tests/unittest/_torch/attention/test_attention_mla.py
diff --git a/tests/unittest/_torch/test_attention_no_cache.py b/tests/unittest/_torch/attention/test_attention_no_cache.py
similarity index 100%
rename from tests/unittest/_torch/test_attention_no_cache.py
rename to tests/unittest/_torch/attention/test_attention_no_cache.py
diff --git a/tests/unittest/_torch/test_flashinfer_attention.py b/tests/unittest/_torch/attention/test_flashinfer_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_flashinfer_attention.py
rename to tests/unittest/_torch/attention/test_flashinfer_attention.py
diff --git a/tests/unittest/_torch/test_flashinfer_star_attn.py b/tests/unittest/_torch/attention/test_flashinfer_star_attn.py
similarity index 100%
rename from tests/unittest/_torch/test_flashinfer_star_attn.py
rename to tests/unittest/_torch/attention/test_flashinfer_star_attn.py
diff --git a/tests/unittest/_torch/test_vanilla_attention.py b/tests/unittest/_torch/attention/test_vanilla_attention.py
similarity index 100%
rename from tests/unittest/_torch/test_vanilla_attention.py
rename to tests/unittest/_torch/attention/test_vanilla_attention.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
index 6df4b188ac6..1ff956580ee 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
@@ -600,6 +600,7 @@ def test_trtllm_bench(llm_root):  # noqa: F811
     run_benchmark(model_name, dataset_path, temp_dir)
 
 
+@pytest.mark.skip(reason="https://nvbugs/5458798")
 @pytest.mark.no_xdist
 def test_trtllm_bench_backend_comparison(llm_root):  # noqa: F811
     """Test that compares autodeploy backend performance against pytorch backend
diff --git a/tests/unittest/_torch/test_executor_request_queue.py b/tests/unittest/_torch/executor/test_executor_request_queue.py
similarity index 100%
rename from tests/unittest/_torch/test_executor_request_queue.py
rename to tests/unittest/_torch/executor/test_executor_request_queue.py
diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/executor/test_overlap_scheduler.py
similarity index 100%
rename from tests/unittest/_torch/test_overlap_scheduler.py
rename to tests/unittest/_torch/executor/test_overlap_scheduler.py
diff --git a/tests/unittest/_torch/test_overlap_scheduler_input.json b/tests/unittest/_torch/executor/test_overlap_scheduler_input.json
similarity index 100%
rename from tests/unittest/_torch/test_overlap_scheduler_input.json
rename to tests/unittest/_torch/executor/test_overlap_scheduler_input.json
diff --git a/tests/unittest/_torch/test_pytorch_model_engine.py b/tests/unittest/_torch/executor/test_pytorch_model_engine.py
similarity index 100%
rename from tests/unittest/_torch/test_pytorch_model_engine.py
rename to tests/unittest/_torch/executor/test_pytorch_model_engine.py
diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/executor/test_resource_manager.py
similarity index 99%
rename from tests/unittest/_torch/test_resource_manager.py
rename to tests/unittest/_torch/executor/test_resource_manager.py
index 24320a993b3..dc67e9bbfcd 100644
--- a/tests/unittest/_torch/test_resource_manager.py
+++ b/tests/unittest/_torch/executor/test_resource_manager.py
@@ -23,7 +23,7 @@
 LoraModule = tensorrt_llm.bindings.LoraModule
 LoraModuleType = tensorrt_llm.bindings.LoraModuleType
 
 current_dir = pathlib.Path(__file__).parent.resolve()
-root_dir = current_dir.parent.parent.parent
+root_dir = current_dir.parent.parent.parent.parent
 
 sys.path.append(str(root_dir / "tests" / "integration"))
@@ -44,6 +44,8 @@ def setUpClass(cls):
         """
         cpp_script_dir = os.path.join(cls.CPP_RESOURCES_DIR, "scripts")
 
+        # No reason to run this script for each test.
+        # TODO: move this to a fixture that runs once.
         generate_lora_data_args_tp1 = [
             sys.executable,
             f"{cpp_script_dir}/generate_test_lora_weights.py",
diff --git a/tests/unittest/_torch/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py
similarity index 100%
rename from tests/unittest/_torch/test_autotuner.py
rename to tests/unittest/_torch/misc/test_autotuner.py
diff --git a/tests/unittest/_torch/test_share_tensor.py b/tests/unittest/_torch/misc/test_share_tensor.py
similarity index 100%
rename from tests/unittest/_torch/test_share_tensor.py
rename to tests/unittest/_torch/misc/test_share_tensor.py
diff --git a/tests/unittest/_torch/test_virtual_memory.py b/tests/unittest/_torch/misc/test_virtual_memory.py
similarity index 100%
rename from tests/unittest/_torch/test_virtual_memory.py
rename to tests/unittest/_torch/misc/test_virtual_memory.py
diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py
index 2d11971d99e..397314bcab0 100644
--- a/tests/unittest/_torch/modules/test_fused_moe.py
+++ b/tests/unittest/_torch/modules/test_fused_moe.py
@@ -289,6 +289,7 @@ def per_rank_test_fused_moe_alltoall(job_id):
         assert r is None
 
 
+@pytest.mark.skip(reason="https://nvbugs/5467531")
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
 @pytest.mark.parametrize("alltoall_method_type", [
diff --git a/tests/unittest/_torch/test_group_rmn_norm.py b/tests/unittest/_torch/modules/test_group_rmn_norm.py
similarity index 100%
rename from tests/unittest/_torch/test_group_rmn_norm.py
rename to tests/unittest/_torch/modules/test_group_rmn_norm.py
diff --git a/tests/unittest/_torch/test_mnnvl_memory.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_memory.py
similarity index 100%
rename from tests/unittest/_torch/test_mnnvl_memory.py
rename to tests/unittest/_torch/multi_gpu/test_mnnvl_memory.py
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
index 5c374d0f2aa..6149201d582 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
@@ -1,6 +1,7 @@
 from difflib import SequenceMatcher
 
 import pytest
+import torch
 from utils.llm_data import llm_models_root
 
 from tensorrt_llm import LLM, SamplingParams
@@ -43,19 +44,17 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph,
             "This is a very long prompt to exercise long context. Count up to 10000 from 1, 2, 3,"
             + ", ".join(str(i) for i in range(4, 9000))
         },
-        # TODO: Fix multimodal test.
-        # {
-        #     "prompt": "<|image|>This image is of color",
-        #     "multi_modal_data": {
-        #         "image": [torch.ones(3, 1024, 1024)]
-        #     }
-        # },
+        {
+            "prompt": "<|image|>This image is of color",
+            "multi_modal_data": {
+                "image": [torch.ones(3, 1024, 1024)]
+            }
+        },
     ]
 
     expected_outputs = [
-        " the head of state and head of government of the",
-        ", 9000, 9001, ",
-        # " white. What is the color of the background of"  # TODO: Fix multimodal test.
+        " the head of state and head of government of the", ", 9000, 9001, ",
+        " white. What is the color of the background of"
     ]
 
     pytorch_config = dict(attn_backend=backend)
diff --git a/tests/unittest/_torch/test_beam_search.py b/tests/unittest/_torch/sampler/test_beam_search.py
similarity index 100%
rename from tests/unittest/_torch/test_beam_search.py
rename to tests/unittest/_torch/sampler/test_beam_search.py
diff --git a/tests/unittest/_torch/test_best_of_n.py b/tests/unittest/_torch/sampler/test_best_of_n.py
similarity index 100%
rename from tests/unittest/_torch/test_best_of_n.py
rename to tests/unittest/_torch/sampler/test_best_of_n.py
diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/sampler/test_return_logits.py
similarity index 100%
rename from tests/unittest/_torch/test_return_logits.py
rename to tests/unittest/_torch/sampler/test_return_logits.py
diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/sampler/test_trtllm_sampler.py
similarity index 82%
rename from tests/unittest/_torch/test_trtllm_sampler.py
rename to tests/unittest/_torch/sampler/test_trtllm_sampler.py
index d2fb0e9e65c..37227f9b53f 100644
--- a/tests/unittest/_torch/test_trtllm_sampler.py
+++ b/tests/unittest/_torch/sampler/test_trtllm_sampler.py
@@ -1,6 +1,3 @@
-import json
-from pathlib import Path
-
 import pytest
 from utils.llm_data import llm_models_root
 from utils.util import similar
@@ -10,13 +7,6 @@
 from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
 
 
-# A test case of mmlu_llama from lm_eval
-@pytest.fixture(scope="module")
-def test_case():
-    with open(Path(__file__).parent / "test_overlap_scheduler_input.json") as f:
-        return json.load(f)
-
-
 @pytest.fixture(scope="module")
 def model_path():
     return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@@ -39,7 +29,7 @@ def create_llm(model_dir):
 
 
 @pytest.mark.high_cuda_memory
-def test_trtllm_sampler(model_path, test_case):
+def test_trtllm_sampler(model_path):
     prompts = [
         "Magellan and Elcano lead the first",
         "The capital of France is",
@@ -50,10 +40,10 @@ def test_trtllm_sampler(model_path):
                        ["La Paz"]]
 
     # Test configuration
-    max_new_tokens = test_case["max_new_tokens"]
-    temperature = test_case["temperature"]
-    top_p = test_case["top_p"]
-    stop_words = test_case["stop_words"]
+    max_new_tokens = 10
+    temperature = 1.0
+    top_p = None
+    stop_words = ["."]
 
     sampling_config = SamplingParams(max_tokens=max_new_tokens,
                                      n=1,
diff --git a/tests/unittest/_torch/test_custom_ops.py b/tests/unittest/_torch/thop/test_custom_ops.py
similarity index 100%
rename from tests/unittest/_torch/test_custom_ops.py
rename to tests/unittest/_torch/thop/test_custom_ops.py
diff --git a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py b/tests/unittest/_torch/thop/test_fp8_per_tensor_scale_tllmg_gemm.py
similarity index 100%
rename from tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
rename to tests/unittest/_torch/thop/test_fp8_per_tensor_scale_tllmg_gemm.py