min16float3 support bug? #7013

fengliancanxue · 2024-11-20T02:10:35Z

Description
When I test min16float3 support without -enable-16bit-types, the DXIL says the size of min16float is 2 bytes and its alignment is 4 bytes, but when it uses
%dx.types.CBufRet.f16 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %Foo_cbuffer, i32 0) to load the data, in theory the address should be index * 4 bytes (the alignment), but when I test the shader on an RTX 4080, the address is calculated as index * 2 bytes (the min16float size). Is this a bug, or am I missing something?

Steps to Reproduce
shader source:

cbuffer Foo
{
	min16float a;
	min16float c;
	min16float3 e;
	float b;
}

void MainVS(
           out float4 pos : SV_Position
           )
{
	pos = float4(e, a);
}

dxil:

;
; Note: shader requires additional functionality:
;       Minimum-precision data types
;
;
; Input signature:
;
; Name                 Index   Mask Register SysValue  Format   Used
; -------------------- ----- ------ -------- -------- ------- ------
; no parameters
;
; Output signature:
;
; Name                 Index   Mask Register SysValue  Format   Used
; -------------------- ----- ------ -------- -------- ------- ------
; SV_Position              0   xyzw        0      POS   float   xyzw
;
; shader debug name: be1be8c76f396b17ddcad84edb41ff8f.pdb
; shader hash: be1be8c76f396b17ddcad84edb41ff8f
;
; Pipeline Runtime Information: 
;
; Vertex Shader
; OutputPositionPresent=1
;
;
; Output signature:
;
; Name                 Index             InterpMode DynIdx
; -------------------- ----- ---------------------- ------
; SV_Position              0          noperspective       
;
; Buffer Definitions:
;
; cbuffer Foo
; {
;
;   struct hostlayout.Foo
;   {
;
;       min16float a;                                 ; Offset:    0
;       min16float c;                                 ; Offset:    4
;       min16float3 e;                                ; Offset:   16
;       float b;                                      ; Offset:   28
;   
;   } Foo;                                            ; Offset:    0 Size:    32
;
; }
;
;
; Resource Bindings:
;
; Name                                 Type  Format         Dim      ID      HLSL Bind  Count
; ------------------------------ ---------- ------- ----------- ------- -------------- ------
; Foo                               cbuffer      NA          NA     CB0            cb0     1
;
;
; ViewId state:
;
; Number of inputs: 0, outputs: 4
; Outputs dependent on ViewId: {  }
; Inputs contributing to computation of Outputs:
;
target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

%dx.types.Handle = type { i8* }
%dx.types.CBufRet.f16 = type { half, half, half, half }
%hostlayout.Foo = type { float, float, <3 x float>, float }

define void @MainVS() {
  %Foo_cbuffer = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 0, i1 false), !dbg !70 ; line:10 col:23  ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
  %1 = call %dx.types.CBufRet.f16 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %Foo_cbuffer, i32 1), !dbg !71 ; line:13 col:15  ; CBufferLoadLegacy(handle,regIndex)
  %2 = extractvalue %dx.types.CBufRet.f16 %1, 0, !dbg !71 ; line:13 col:15
  %3 = extractvalue %dx.types.CBufRet.f16 %1, 1, !dbg !71 ; line:13 col:15
  %4 = extractvalue %dx.types.CBufRet.f16 %1, 2, !dbg !71 ; line:13 col:15
  %5 = call %dx.types.CBufRet.f16 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %Foo_cbuffer, i32 0), !dbg !72 ; line:13 col:18  ; CBufferLoadLegacy(handle,regIndex)
  %6 = extractvalue %dx.types.CBufRet.f16 %5, 0, !dbg !72 ; line:13 col:18
  %7 = fpext half %6 to float, !dbg !72 ; line:13 col:18
  %8 = fpext half %2 to float, !dbg !73 ; line:13 col:14
  %9 = fpext half %3 to float, !dbg !73 ; line:13 col:14
  %10 = fpext half %4 to float, !dbg !73 ; line:13 col:14
  call void @llvm.dbg.value(metadata float %8, i64 0, metadata !74, metadata !75), !dbg !76 ; var:"pos" !DIExpression(DW_OP_bit_piece, 0, 32) func:"MainVS"
  call void @llvm.dbg.value(metadata float %9, i64 0, metadata !74, metadata !77), !dbg !76 ; var:"pos" !DIExpression(DW_OP_bit_piece, 32, 32) func:"MainVS"
  call void @llvm.dbg.value(metadata float %10, i64 0, metadata !74, metadata !78), !dbg !76 ; var:"pos" !DIExpression(DW_OP_bit_piece, 64, 32) func:"MainVS"
  call void @llvm.dbg.value(metadata float %7, i64 0, metadata !74, metadata !79), !dbg !76 ; var:"pos" !DIExpression(DW_OP_bit_piece, 96, 32) func:"MainVS"
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %8), !dbg !76 ; line:13 col:6  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %9), !dbg !76 ; line:13 col:6  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %10), !dbg !76 ; line:13 col:6  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float %7), !dbg !76 ; line:13 col:6  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  ret void, !dbg !80 ; line:14 col:1
}

; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #0

; Function Attrs: nounwind
declare void @dx.op.storeOutput.f32(i32, i32, i32, i8, float) #1

; Function Attrs: nounwind readonly
declare %dx.types.CBufRet.f16 @dx.op.cbufferLoadLegacy.f16(i32, %dx.types.Handle, i32) #2

; Function Attrs: nounwind readonly
declare %dx.types.Handle @dx.op.createHandle(i32, i8, i32, i32, i1) #2

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!41, !42}
!llvm.ident = !{!43}
!dx.source.contents = !{!44}
!dx.source.defines = !{!2}
!dx.source.mainFileName = !{!45}
!dx.source.args = !{!46}
!dx.version = !{!47}
!dx.valver = !{!48}
!dx.shaderModel = !{!49}
!dx.resources = !{!50}
!dx.typeAnnotations = !{!53, !59}
!dx.viewIdState = !{!62}
!dx.entryPoints = !{!63}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "dxc(private) 1.8.0.0 (private, 00000000)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !16, globals: !22)
!1 = !DIFile(filename: "VisibilityVertexShader.usf", directory: "")
!2 = !{}
!3 = !{!4}
!4 = !DIDerivedType(tag: DW_TAG_typedef, name: "float4", file: !1, line: 6, baseType: !5)
!5 = !DICompositeType(tag: DW_TAG_class_type, name: "vector<float, 4>", file: !1, line: 6, size: 128, align: 32, elements: !6, templateParams: !12)
!6 = !{!7, !9, !10, !11}
!7 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !5, file: !1, line: 6, baseType: !8, size: 32, align: 32, flags: DIFlagPublic)
!8 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
!9 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !5, file: !1, line: 6, baseType: !8, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
!10 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !5, file: !1, line: 6, baseType: !8, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
!11 = !DIDerivedType(tag: DW_TAG_member, name: "w", scope: !5, file: !1, line: 6, baseType: !8, size: 32, align: 32, offset: 96, flags: DIFlagPublic)
!12 = !{!13, !14}
!13 = !DITemplateTypeParameter(name: "element", type: !8)
!14 = !DITemplateValueParameter(name: "element_count", type: !15, value: i32 4)
!15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!16 = !{!17}
!17 = !DISubprogram(name: "MainVS", scope: !1, file: !1, line: 9, type: !18, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @MainVS)
!18 = !DISubroutineType(types: !19)
!19 = !{null, !20}
!20 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !21)
!21 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !4)
!22 = !{!23, !27, !28, !39}
!23 = !DIGlobalVariable(name: "a", linkageName: "\01?a@Foo@@3$min16f@B", scope: !0, file: !1, line: 3, type: !24, isLocal: false, isDefinition: true)
!24 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25)
!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "min16float", file: !1, line: 3, baseType: !26)
!26 = !DIBasicType(name: "min16float", size: 16, align: 32, encoding: DW_ATE_float)
!27 = !DIGlobalVariable(name: "c", linkageName: "\01?c@Foo@@3$min16f@B", scope: !0, file: !1, line: 4, type: !24, isLocal: false, isDefinition: true)
!28 = !DIGlobalVariable(name: "e", linkageName: "\01?e@Foo@@3V?$vector@$min16f@$02@@B", scope: !0, file: !1, line: 5, type: !29, isLocal: false, isDefinition: true)
!29 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !30)
!30 = !DIDerivedType(tag: DW_TAG_typedef, name: "min16float3", file: !1, line: 5, baseType: !31)
!31 = !DICompositeType(tag: DW_TAG_class_type, name: "vector<min16float, 3>", file: !1, line: 5, size: 96, align: 32, elements: !32, templateParams: !36)
!32 = !{!33, !34, !35}
!33 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !31, file: !1, line: 5, baseType: !26, size: 16, align: 32, flags: DIFlagPublic)
!34 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !31, file: !1, line: 5, baseType: !26, size: 16, align: 32, offset: 16, flags: DIFlagPublic)
!35 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !31, file: !1, line: 5, baseType: !26, size: 16, align: 32, offset: 32, flags: DIFlagPublic)
!36 = !{!37, !38}
!37 = !DITemplateTypeParameter(name: "element", type: !26)
!38 = !DITemplateValueParameter(name: "element_count", type: !15, value: i32 3)
!39 = !DIGlobalVariable(name: "b", linkageName: "\01?b@Foo@@3MB", scope: !0, file: !1, line: 6, type: !40, isLocal: false, isDefinition: true)
!40 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8)
!41 = !{i32 2, !"Dwarf Version", i32 4}
!42 = !{i32 2, !"Debug Info Version", i32 3}
!43 = !{!"dxc(private) 1.8.0.0 (private, 00000000)"}
!44 = !{!"VisibilityVertexShader.usf", !"cbuffer Foo\0D\0A{\0D\0A\09min16float a;\0D\0A\09min16float c;\0D\0A\09min16float3 e;\0D\0A\09float b;\0D\0A}\0D\0A\0D\0Avoid MainVS(\0D\0A           out float4 pos : SV_Position\0D\0A           )\0D\0A{\0D\0A\09pos = float4(e, a);\0D\0A}"}
!45 = !{!"VisibilityVertexShader.usf"}
!46 = !{!"-E", !"MainVS", !"-T", !"vs_6_0", !"/Zpr", !"/O1", !"/Zi", !"/HV", !"2016", !"/Fo", !"VisibilityVertexShader.dxil", !"-Qembed_debug"}
!47 = !{i32 1, i32 0}
!48 = !{i32 1, i32 8}
!49 = !{!"vs", i32 6, i32 0}
!50 = !{null, null, !51, null}
!51 = !{!52}
!52 = !{i32 0, %hostlayout.Foo* undef, !"Foo", i32 0, i32 0, i32 1, i32 32, null}
!53 = !{i32 0, %hostlayout.Foo undef, !54}
!54 = !{i32 32, !55, !56, !57, !58}
!55 = !{i32 6, !"a", i32 3, i32 0, i32 7, i32 8}
!56 = !{i32 6, !"c", i32 3, i32 4, i32 7, i32 8}
!57 = !{i32 6, !"e", i32 3, i32 16, i32 7, i32 8}
!58 = !{i32 6, !"b", i32 3, i32 28, i32 7, i32 9}
!59 = !{i32 1, void ()* @MainVS, !60}
!60 = !{!61}
!61 = !{i32 0, !2, !2}
!62 = !{[2 x i32] [i32 0, i32 4]}
!63 = !{void ()* @MainVS, !"MainVS", !64, !50, !69}
!64 = !{null, !65, null}
!65 = !{!66}
!66 = !{i32 0, !"SV_Position", i8 9, i8 3, !67, i8 4, i32 1, i8 4, i32 0, i8 0, !68}
!67 = !{i32 0}
!68 = !{i32 3, i32 15}
!69 = !{i32 0, i64 32}
!70 = !DILocation(line: 10, column: 23, scope: !17)
!71 = !DILocation(line: 13, column: 15, scope: !17)
!72 = !DILocation(line: 13, column: 18, scope: !17)
!73 = !DILocation(line: 13, column: 14, scope: !17)
!74 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "pos", arg: 1, scope: !17, file: !1, line: 10, type: !4)
!75 = !DIExpression(DW_OP_bit_piece, 0, 32)
!76 = !DILocation(line: 13, column: 6, scope: !17)
!77 = !DIExpression(DW_OP_bit_piece, 32, 32)
!78 = !DIExpression(DW_OP_bit_piece, 64, 32)
!79 = !DIExpression(DW_OP_bit_piece, 96, 32)
!80 = !DILocation(line: 14, column: 1, scope: !17)

Actual Behavior
The value read from the min16float3 is not as expected: only min16float3.x is correct; .yz are incorrect because the data is fetched from the wrong offset.

Environment

DXC version 1.8(dev)
Host Operating System OS Name Microsoft Windows 11 Pro Version 10.0.22631 Build 22631

llvm-beanz · 2024-11-20T18:14:33Z

I'm not sure what you mean by:

when i test the shader on rtx 4080, the adress is caculated with index * 2byes(min16float size), is this a bug or am missing something?

min16float types are 32-bit float values when stored in memory, and the driver can (optionally) reduce them down to 16-bit types in the compiler for the running shader.

The DXIL you provided looks accurate to me, where each of the cbuffer elements is laid out as if the min16float values occupy 32-bits (because they do), and the compiler can load those into f16 values in the SSA form. Those f16 values are not of known size in DXIL (even though we call them f16), the driver decides what size to make them.

If instead you used half with -enable-16bit-types you would see f16 values that are 16-bit, there is a flag in the DXIL that denotes if f16 values are 16-bit or driver's choice.

Closing since this seems to all be correct. If you have further questions please re-open.

fengliancanxue · 2024-11-21T02:04:28Z

I'm not sure what you mean by:

when i test the shader on rtx 4080, the adress is caculated with index * 2byes(min16float size), is this a bug or am missing something?

min16float types are 32-bit float values when stored in memory, and the driver can (optionally) reduce them down to 16-bit types in the compiler for the running shader.

The DXIL you provided looks accurate to me, where each of the cbuffer elements is laid out as if the min16float values occupy 32-bits (because they do), and the compiler can load those into f16 values in the SSA form. Those f16 values are not of known size in DXIL (even though we call them f16), the driver decides what size to make them.

If instead you used half with -enable-16bit-types you would see f16 values that are 16-bit, there is a flag in the DXIL that denotes if f16 values are 16-bit or driver's choice.

Closing since this seems to all be correct. If you have further questions please re-open.

min16float types are 32-bit float values when stored in memory, and the driver can (optionally) reduce them down to 16-bit types in the compiler for the running shader.

Yeah, I also think so, but that is not what I observed in my tests. For example:

cbuffer Foo
{
    min16float4 a;
}

When updating the constant buffer, I expected to write a float4 value, but I actually have to write a float16_t4 value. It behaves as if I had enabled -enable-16bit-types, but the DXIL says:

; Note: shader requires additional functionality:
;       Minimum-precision data types

which indicates that I didn't use -enable-16bit-types. What I expected is what you described above, i.e. the same behavior as Vulkan's RelaxedPrecision. Or is the driver doing something that violates the spec?

What do you mean by "the driver can reduce them down to 16-bit types"? Does it read the value as a float and then convert that float to float16_t?

Also, it says I don't have permission to reopen the issue.

fengliancanxue added bug Bug, regression, crash needs-triage Awaiting triage labels Nov 20, 2024

github-project-automation bot added this to HLSL Triage Nov 20, 2024

llvm-beanz closed this as not planned Won't fix, can't repro, duplicate, stale Nov 20, 2024

github-project-automation bot moved this to Triaged in HLSL Triage Nov 20, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

min16float3 support bug? #7013

min16float3 support bug? #7013

fengliancanxue commented Nov 20, 2024 •

edited

Loading

llvm-beanz commented Nov 20, 2024

fengliancanxue commented Nov 21, 2024

min16float3 support bug? #7013

min16float3 support bug? #7013

Comments

fengliancanxue commented Nov 20, 2024 • edited Loading

llvm-beanz commented Nov 20, 2024

fengliancanxue commented Nov 21, 2024

fengliancanxue commented Nov 20, 2024 •

edited

Loading