Closed
Changes from all 22 commits:
acef6e8  test pass (zhou-yuxin, Aug 21, 2025)
27ed25a  pre-commit format (zhou-yuxin, Aug 21, 2025)
cdd456f  test pass (zhou-yuxin, Aug 21, 2025)
1fe8273  pre-commit format (zhou-yuxin, Aug 21, 2025)
4338e08  [None][chore] Update namelist in blossom-ci (#7015) (karljang, Aug 20, 2025)
d271157  [None][ci] move unittests to sub-directories (#6635) (Funatiq, Aug 20, 2025)
89747a8  [None][infra] Waive failed tests on main branch 8/20 (#7092) (EmmaQiaoCh, Aug 20, 2025)
8d445a4  [None][fix] Fix W4A8 MoE kernel issue (#7072) (yuhyao, Aug 20, 2025)
d36ba89  [TRTLLM-7348] [feat] Enable Cross-Attention to use XQA kernels for Wh… (DomBrown, Aug 20, 2025)
5920e6e  [None][chore] Only check the bindings lib for current build (#7026) (liji-nv, Aug 20, 2025)
54bc8fd  [None][ci] move some tests of b200 to post merge (#7093) (QiJune, Aug 20, 2025)
29aee2a  [https://nvbugs/5457489][fix] unwaive some tests (#6991) (byshiue, Aug 21, 2025)
890dda6  [TRTLLM-6771][feat] Support MMMU for multimodal models (#6828) (yechank-nvidia, Aug 21, 2025)
ab3153c  [None][fix] Fix llama4 multimodal by skipping request validation (#6957) (chang-l, Aug 21, 2025)
1b3709e  [None][infra] Upgrade UCX to v1.19.x and NIXL to 0.5.0 (#7024) (BatshevaBlack, Aug 21, 2025)
d84e1c7  [None][fix] update accelerate dependency to 1.7+ for AutoDeploy (#7077) (Fridah-nv, Aug 21, 2025)
c923ba7  [None][fix] Fix const modifier inconsistency in log function declarat… (Fan-Yunfan, Aug 21, 2025)
8233dda  [None][chore] waive failed cases on H100 (#7084) (xinhe-nv, Aug 21, 2025)
176f367  [fix]: use safeInitRowMax instead of fp32_lowest to avoid NaN (#7087) (lowsfer, Aug 21, 2025)
cc35ba2  [https://nvbugs/5443039][fix] Fix AutoDeploy pattern matcher for torc… (Fridah-nv, Aug 21, 2025)
9631242  [https://nvbugs/5437405][fix] qwen3 235b eagle3 ci (#7000) (byshiue, Aug 21, 2025)
810beb2  [None][doc] Update gpt-oss deployment guide to latest release image (… (farshadghodsian, Aug 21, 2025)
4 changes: 2 additions & 2 deletions .github/CODEOWNERS
@@ -50,8 +50,8 @@
/tests/unittest/_torch/compilation @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/multi_gpu/test_ar_residual_norm.py @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/multi_gpu/test_user_buffers.py @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/thop/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler
/tests/unittest/_torch/misc/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler

## TensorRT-LLM Pytorch - Attention
/tensorrt_llm/_torch/attention_backend @NVIDIA/trt-llm-torch-attention-devs
2 changes: 1 addition & 1 deletion .github/workflows/blossom-ci.yml
@@ -40,7 +40,7 @@ jobs:
startsWith(github.event.comment.body, '/bot skip --comment') ||
startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
startsWith(github.event.comment.body, '/bot kill')) && contains(
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025"]'),
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang"]'),
github.actor)
steps:
- name: Check if comment is issued by authorized person
4 changes: 2 additions & 2 deletions README.md
@@ -18,10 +18,9 @@ TensorRT-LLM
<div align="left">

## Tech Blogs
* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM
✨ [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)


* [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)

@@ -44,6 +43,7 @@ TensorRT-LLM
✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)

## Latest News
* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b)
* [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)
* [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai)
* [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick
13 changes: 7 additions & 6 deletions cpp/include/tensorrt_llm/common/logger.h
@@ -54,20 +54,21 @@ class Logger

#if defined(_MSC_VER)
template <typename... Args>
void log(Level level, char const* format, Args const&... args);
void log(Level const level, char const* format, Args const&... args);

template <typename... Args>
void log(Level level, int rank, char const* format, Args const&... args);
void log(Level const level, int const rank, char const* format, Args const&... args);
#else
template <typename... Args>
void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
void log(Level const level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));

template <typename... Args>
void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0)));
void log(Level const level, int const rank, char const* format, Args const&... args)
__attribute__((format(printf, 4, 0)));
#endif

template <typename... Args>
void log(Level level, std::string const& format, Args const&... args)
void log(Level const level, std::string const& format, Args const&... args)
{
return log(level, format.c_str(), args...);
}
@@ -134,7 +135,7 @@ class Logger
};

template <typename... Args>
void Logger::log(Logger::Level level, char const* format, Args const&... args)
void Logger::log(Logger::Level const level, char const* format, Args const&... args)
{
if (isEnabled(level))
{
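The logger.h change above is purely about consistency: in C++, a top-level const on a by-value parameter is not part of the function's type, so declarations and definitions may legally disagree on it; the PR simply makes both spell it the same way. Below is a minimal, self-contained sketch of that rule, using a hypothetical log_line function rather than the TensorRT-LLM Logger.

#include <cstdio>

// Declaration and definition name the same function even though only one of
// them marks `level` as const: top-level const on a by-value parameter does
// not participate in the function type or in overload resolution.
void log_line(int const level, char const* message);

void log_line(int level, char const* message)
{
    std::printf("[%d] %s\n", level, message);
}

int main()
{
    log_line(2, "hello");  // prints: [2] hello
    return 0;
}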
4 changes: 2 additions & 2 deletions cpp/kernels/fmha_v2/fmha_test.py
@@ -165,8 +165,8 @@ def test_trtllm_context_mla_attention_fmha(dtype, s):
if dtype == "-bf16" and s == 4096:
epsilon += ' -epsilon 0.03'

if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version != 120:
pytest.skip("FP8 MLAs are only supported on sm120 currently.")
if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version not in [90, 120]:
pytest.skip("FP8 MLAs are only supported on sm90 and sm120 currently.")

# Context phase kernels, always use separate-q-k-v layout.
subprocess.run(
51 changes: 47 additions & 4 deletions cpp/kernels/fmha_v2/setup.py
@@ -1914,8 +1914,9 @@ def enable_mutex(kspec):


def enable_tma_store(kspec):
output_dtype = kspec.output_dtype if kspec.output_dtype is not None else kspec.dtype
# TMA copies data in the 16B granularity.
return 'true' if (kspec.dtype in ['e4m3', 'e4m3_fp32']
return 'true' if (output_dtype in ['e4m3', 'e4m3_fp32']
and kspec.head_size % 16 == 0) else 'false'


@@ -3812,7 +3813,9 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
# use specialized kernels for cases without alibi scales.
# there is a numeric issues when applying the exp2f scale optimization and alibi scale at the same time.
combinations = product([False, True], \
[InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV, InputLayout.Q_PAGED_KV], [False, True])
[InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV,
InputLayout.Q_PAGED_KV, InputLayout.SEPARATE_Q_K_V],
[False, True])
for (alibi, input_layout, enable_attn_logit_softcapping) in combinations:
# alibi and bmm1_tanh_scale shouldn't be used together.
if alibi and enable_attn_logit_softcapping:
@@ -3911,7 +3914,7 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
has_noloop=0,
noloop_step=64,
kv_loop_step=
128, # use 64 kv step size to avoid register spilling
128, # use 128 kv step size to avoid register spilling
kv_tile_buffers=2, # only used by warp specialized kernels
unroll_threshold=1,
has_scale_max=False,
@@ -3926,6 +3929,46 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
sage_block_sizes=sage_block_sizes,
output_dtype=output_dtype))

# context MLA (192x128)
# we could use param 'output_dtype' of enumerate_qgmma_flash_warpspec_kernels(),
# but it will generate many unnecessary kernels and they are not easy to filter out.
for output_type in [None, 'bf16']:
specs.append(
kernel_spec(
sm=sm,
sm_mma=90,
dtype=dtype,
seq_len=0, # support any sequence length
head_size=192,
head_size_v=128,
warps_m=4, #4x1 warpgroups
warps_n=1,
version=2,
interleaved=False,
ldgsts_q=
False, # for Hopper kernels, ldgsts = False signals TMA usage.
ldgsts_k=False,
ldgsts_v=False,
share_smem_k_v=False,
loop_step=64,
q_tile_buffers=1, # only used by warp specialized kernels
has_noloop=0,
noloop_step=64,
kv_loop_step=128,
kv_tile_buffers=2, # only used by warp specialized kernels
unroll_threshold=1,
has_scale_max=False,
flash_attention=True,
warp_specialization=True,
alibi=alibi,
enable_attn_logit_softcapping=enable_attn_logit_softcapping,
return_softmax_stats=
False, # return softmax stats is not supported for fp8 now
scheduling_mode=scheduling_mode,
input_layout=input_layout,
sage_block_sizes=sage_block_sizes,
output_dtype=output_type))


def enumerate_igmma_kernels(specs, sm=90):
specs.append(
@@ -6377,7 +6420,7 @@ def enumerate_kernels():
and kspec.tiled == True)
# Deepseek MLA (context 192/128 separate-q-k-v)
or (kspec.sm in [90, 100, 120]
and kspec.dtype in ['bf16', 'e4m3_fp32']
and kspec.dtype in ['bf16', 'e4m3', 'e4m3_fp32']
and kspec.head_size == 192
and kspec.head_size_v == 128
and kspec.input_layout == InputLayout.SEPARATE_Q_K_V
41 changes: 28 additions & 13 deletions cpp/kernels/fmha_v2/src/fmha/hopper/gmem_tile_o_packed.h
@@ -1222,6 +1222,14 @@ struct Gmem_tile_o_qgmma_fp32_16bits
inline __device__ Gmem_tile_o_qgmma_fp32_16bits(
Params const& params, Block_info const& block_info, Shared&&, int tidx, int cta_row_offset = 0)
: params_o_stride_in_bytes_(params.o_stride_in_bytes)
, params_scale_bmm2_(
#ifdef GENERATE_CUBIN
// Specialized for trt-llm generated cubins only.
params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2
#else
params.scale_bmm2
#endif
)
, actual_seqlen_(block_info.actual_seqlen)
, o_ptr_(reinterpret_cast<char*>(params.o_ptr))
{
@@ -1251,21 +1259,24 @@
inline __device__ void store(Accumulators const (&acc)[M][N])
{
int64_t const step_m = 8 * params_o_stride_in_bytes_;
// we assume M = 1. some shortcuts.
static_assert(M == 1);

#define STORE_COLUMN(idx) \
{ \
float _reg0 = acc[0][mma_ni].elt(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + idx); \
float _reg1 = acc[0][mma_ni].elt(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + idx); \
static_assert(std::is_same_v<Output_type, bf16_t> || std::is_same_v<Output_type, fp16_t>); \
uint32_t _out = fmha::float2_to_16bit_2<Output_type>(_reg0, _reg1); \
int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \
fmha::stg(o_ptr_ + _offset + 4 * idx, _out); \
}
#ifdef UNIFIED_EPILOGUE_SCALE
constexpr bool Scale = false;
#else
constexpr bool Scale = true;
#endif
#define STORE_COLUMNS() \
{ \
STORE_COLUMN(0) STORE_COLUMN(1) \
/* we assume M = 1. some shortcuts. */ \
static_assert(M == 1); \
uint4 _src = { \
.x = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2), \
.y = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2), \
.z = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1), \
.w = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1), \
}; \
uint2 _dst = Acc_packer<float, Output_type, Scale>::run(this, _src); \
int64_t _offset = (int64_t) ri * step_m + (int64_t) (ci + mma_ni * COLS_PER_THREAD) * STEP_N; \
fmha::stg(o_ptr_ + _offset, _dst); \
}

#pragma unroll
@@ -1303,6 +1314,10 @@ struct Gmem_tile_o_qgmma_fp32_16bits

// The stride between rows for the QKV matrice.
int64_t params_o_stride_in_bytes_;
// Scaling factor; this usually means QKV descale factor in actuality
uint32_t params_scale_bmm2_;
// The pointer.
char* o_ptr_;
// The row loaded by this thread.
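For context on the store() rewrite above: the old STORE_COLUMN converted one column at a time with float2_to_16bit_2, while the new STORE_COLUMNS gathers four fp32 accumulator registers into a uint4 and lets Acc_packer convert them (and optionally apply the bmm2 scale, unless UNIFIED_EPILOGUE_SCALE is defined) into the 16-bit output type before a single stg. The snippet below is only a standalone illustration of the underlying fp32-to-bf16 packing step; float_to_bf16 and pack_bf16x2 are hypothetical helpers, not the fmha::Acc_packer implementation, and NaN handling is omitted.

#include <cstdint>
#include <cstring>

// Round one fp32 value to bf16 (round-to-nearest-even) and return its 16-bit pattern.
static uint16_t float_to_bf16(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    uint32_t lsb = (bits >> 16) & 1u;   // tie-breaking bit for round-to-nearest-even
    bits += 0x7FFFu + lsb;              // add rounding bias to the bits that will be truncated
    return static_cast<uint16_t>(bits >> 16);
}

// Pack two fp32 accumulator values into one 32-bit register holding bf16x2,
// the same shape of data the epilogue writes out per register with stg.
static uint32_t pack_bf16x2(float lo, float hi)
{
    return static_cast<uint32_t>(float_to_bf16(lo)) | (static_cast<uint32_t>(float_to_bf16(hi)) << 16);
}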