Commit e9cd810

keep sm90 headsize 128 cubins (#5320)
Signed-off-by: Qidi Sang <[email protected]>
1 parent 6aef149 commit e9cd810

File tree

200 files changed: 1360 additions, 2170 deletions


cpp/kernels/fmha_v2/README.md

Lines changed: 9 additions & 1 deletion

cpp/kernels/fmha_v2/setup.py

Lines changed: 13 additions & 4 deletions
@@ -3049,14 +3049,20 @@ def get_kernel_traits_code(specs_names):
     return code


+# For now, only hopper head_size 128 kernel uses cubins, and other kernels use cu files.
+# You should set the condition `use_cubin_header` to false if you have modified the source code of the FMHA kernels on Hopper (sm90) with head_size 128.
+# This ensures that the kernels will be recompiled using the updated source code rather than relying on precompiled cubins.
+def use_cubin_header(kspec):
+    return kspec.sm == 90 and kspec.head_size == 128
+
+
 def get_cubin_header(kernel_traits, specs_names):
     cubins = []
     cubin_lens = []
     cubins_dict = {}
     cubin_lens_dict = {}
     for kspec, fname, lname, kname in specs_names:
-        # only generate hopper cubin header
-        if generate_cu_trtllm and not 'sm90' in kname:
+        if generate_cu_trtllm and not use_cubin_header(kspec):
             continue
         name = fname.replace('.', '_')
         data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name)
@@ -3209,7 +3215,7 @@ def get_cubin_header(kernel_traits, specs_names):
     if generate_cu_trtllm:

         def get_lname_from_kname(kname: str) -> str:
-            if 'sm90' in kname:
+            if use_cubin_header(kspec):
                 return 'nullptr'
             lname = kname.replace('_kernel', '')
             mask_types = [
@@ -3228,7 +3234,7 @@ def get_lname_from_kname(kname: str) -> str:
            {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
            {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
            {is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
-        '''.format(**locals()) if 'sm90' in kname else '''\
+        '''.format(**locals()) if use_cubin_header(kspec) else '''\
            {{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
            {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
            0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
@@ -3404,6 +3410,9 @@ def get_lname_from_kname(kname: str) -> str:
     return code


+# This is used to add some kernels running in cubins.
+# The source code of paged context fmha kernels are not in this repo, but we have cubins for them.
+# Other kernels are for passing CI cases.
 def modify_cubin_header(cubin_header):
     # for paged context fmha cases
     target = "#ifndef EXCLUDE_SM_90"
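The new use_cubin_header helper above is the single switch that decides which kernel specs keep referencing precompiled cubins and which are instead compiled from their .cu sources. The minimal sketch below illustrates that routing; the KSpec namedtuple and the sample spec values are illustrative stand-ins, and only the sm/head_size fields and the predicate itself come from the diff:

    from collections import namedtuple

    # Illustrative stand-in for the kernel spec objects setup.py iterates over;
    # only the `sm` and `head_size` fields appear in the diff above.
    KSpec = namedtuple('KSpec', ['sm', 'head_size'])

    def use_cubin_header(kspec):
        # Same predicate as the helper added in this commit: only Hopper (sm90)
        # kernels with head_size 128 keep using precompiled cubins.
        return kspec.sm == 90 and kspec.head_size == 128

    specs = [
        KSpec(sm=90, head_size=128),  # kept as a precompiled cubin
        KSpec(sm=90, head_size=64),   # now built from .cu source
        KSpec(sm=80, head_size=128),  # non-Hopper, built from .cu source
    ]
    for kspec in specs:
        route = 'precompiled cubin' if use_cubin_header(kspec) else 'compiled from .cu file'
        print(kspec, '->', route)

With generate_cu_trtllm enabled, get_cubin_header skips (continue) every spec for which the predicate is false, so only the sm90 head_size 128 kernels end up with extern cubin declarations in the generated header.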

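The comment added above modify_cubin_header explains why some cubin-only kernels (such as the paged context FMHA kernels, whose sources are not in this repo) still have to be injected into the generated header. The function body is not visible in this diff beyond the target assignment, so the following is only a hypothetical sketch of one way such a marker string could be used to splice extra declarations into the header text; the kernel names are placeholders, not the real shipped cubins:

    def modify_cubin_header(cubin_header):
        # Hypothetical sketch only: the real implementation in setup.py is not
        # shown in this diff past the `target` assignment below.
        target = "#ifndef EXCLUDE_SM_90"
        # Placeholder declarations for cubin-only kernels; actual names would
        # have to match the cubins shipped in the repo.
        extra_decls = (
            'extern unsigned char cubin_fmha_v2_example_paged_context_sm90_cubin[];\n'
            'extern uint32_t cubin_fmha_v2_example_paged_context_sm90_cubin_len;\n'
        )
        # Splice the extra declarations in right after the first occurrence of
        # the marker, leaving the rest of the generated header untouched.
        return cubin_header.replace(target, target + '\n' + extra_decls, 1)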
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h

Lines changed: 1338 additions & 1574 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_32_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_256_64_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_32_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_384_64_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_32_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_512_64_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_64_32_ldgsts_sm90.cubin.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.
