Skip to content

Inline gpu function calls #24673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed

Conversation

DialecticalMaterialist
Copy link
Contributor

It seems that the stage2 compiler, when targeting SPIR-V, does not yet have many optimizations such as function inlining and dead-code elimination (DCE). As a result, it emits an empty shell for each function you call, such as gpu.location, together with a call to that shell.

When these functions are marked inline, they are no longer emitted as separate functions in the binary; only their instructions are emitted at the call site. Example below:

build.zig

const std = @import("std");
/// Build script for the example: compiles `tri_vert.zig` to a SPIR-V object
/// with the self-hosted backend and installs the result as `tri_vert.spv`.
pub fn build(b: *std.Build) void {
    // Generic 64-bit SPIR-V target for OpenGL, emitting a SPIR-V object file.
    const shader_target = b.resolveTargetQuery(.{
        .cpu_arch = .spirv64,
        .cpu_model = .{ .explicit = &std.Target.spirv.cpu.generic },
        .os_tag = .opengl,
        .ofmt = .spirv,
        .abi = .none,
    });

    // Root module for the vertex shader source.
    const shader_module = b.createModule(.{
        .root_source_file = b.path("tri_vert.zig"),
        .target = shader_target,
        .optimize = .ReleaseFast,
    });

    // `use_llvm = false` selects the self-hosted code generator.
    const shader_object = b.addObject(.{
        .name = "tri_vert.zig",
        .root_module = shader_module,
        .use_llvm = false,
    });

    // Copy the emitted binary out as part of the default install step.
    const install_shader = b.addInstallFile(shader_object.getEmittedBin(), "../tri_vert.spv");
    b.getInstallStep().dependOn(&install_shader.step);
}

tri_vert.zig

const gpu = @import("std").gpu;

extern const vertices: @Vector(3, f32) addrspace(.input);
export fn main() callconv(.spirv_vertex) void { // vertex-stage entry point; appears as "main" in OpEntryPoint
    gpu.location(&vertices, 0); // shows up as `OpDecorate ... Location 0` on the input variable in the listings

    gpu.position_out.* = .{ vertices[0], vertices[1], vertices[2], 1 }; // stores x, y, z from the input and w = 1 into the Position built-in
}

Assembly output without inline:

               OpCapability Shader
               OpCapability Matrix
               OpCapability Int64
               OpCapability Int8
               OpCapability Int16
               OpMemoryModel Logical GLSL450
               OpEntryPoint Vertex %154 "main" %vertices %position
          %5 = OpString "tri_vert.zig"
         %48 = OpString "gpu.zig"
               OpSourceExtension "zig_errors:"
               OpSource Zig 0
               OpName %void "void"
               OpName %f32 "f32"
               OpName %u64 "u64"
               OpName %u32 "u32"
               OpName %tri_vert_main "tri_vert.main"
               OpName %gpu_location__anon_489 "gpu.location__anon_489"
               OpName %vertices "vertices"
               OpName %position "position"
               OpDecorate %_ptr_Input_v3f32 ArrayStride 16
               OpDecorate %_ptr_Function_v3f32 ArrayStride 16
               OpDecorate %_ptr_Function_f32 ArrayStride 4
               OpDecorate %_ptr_Output_f32 ArrayStride 4
               OpDecorate %_ptr_Output_v4f32 ArrayStride 16
               OpDecorate %vertices Location 0
               OpDecorate %position BuiltIn Position
       %void = OpTypeVoid
        %f32 = OpTypeFloat 32
      %v3f32 = OpTypeVector %f32 3
%_ptr_Input_v3f32 = OpTypePointer Input %v3f32
        %u64 = OpTypeInt 64 0
      %u64_0 = OpConstant %u64 0
%_ptr_Function_v3f32 = OpTypePointer Function %v3f32
%_ptr_Function_f32 = OpTypePointer Function %f32
%_ptr_Output_f32 = OpTypePointer Output %f32
      %v4f32 = OpTypeVector %f32 4
%_ptr_Output_v4f32 = OpTypePointer Output %v4f32
        %u32 = OpTypeInt 32 0
      %u32_0 = OpConstant %u32 0
      %u64_1 = OpConstant %u64 1
      %u32_1 = OpConstant %u32 1
      %u64_2 = OpConstant %u64 2
      %u32_2 = OpConstant %u32 2
      %u32_3 = OpConstant %u32 3
      %f32_1 = OpConstant %f32 1
   %vertices = OpVariable %_ptr_Input_v3f32 Input
   %position = OpVariable %_ptr_Output_v4f32 Output
        %155 = OpTypeFunction %void
%tri_vert_main = OpFunction %void None %155
          %4 = OpLabel
         %17 = OpVariable %_ptr_Function_v3f32 Function
         %29 = OpVariable %_ptr_Function_v3f32 Function
         %36 = OpVariable %_ptr_Function_v3f32 Function
               OpLine %5 5 17
          %6 = OpFunctionCall %void %gpu_location__anon_489
               OpLine %5 7 8
         %12 = OpLoad %v3f32 %vertices Aligned 16
               OpLine %5 7 37
               OpStore %17 %12
         %18 = OpInBoundsAccessChain %_ptr_Function_f32 %17 %u64_0
         %19 = OpLoad %f32 %18
         %26 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_0
               OpStore %26 %19 None
         %27 = OpLoad %v3f32 %vertices Aligned 16
               OpLine %5 7 50
               OpStore %29 %27
         %30 = OpInBoundsAccessChain %_ptr_Function_f32 %29 %u64_1
         %31 = OpLoad %f32 %30
         %33 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_1
               OpStore %33 %31 None
         %34 = OpLoad %v3f32 %vertices Aligned 16
               OpLine %5 7 63
               OpStore %36 %34
         %37 = OpInBoundsAccessChain %_ptr_Function_f32 %36 %u64_2
         %38 = OpLoad %f32 %37
         %40 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_2
               OpStore %40 %38 None
         %42 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_3
               OpStore %42 %f32_1 None
               OpReturn
         %44 = OpLabel
               OpUnreachable
               OpFunctionEnd
%gpu_location__anon_489 = OpFunction %void None %155
         %47 = OpLabel
               OpLine %48 26 5
               OpReturn
         %49 = OpLabel
               OpUnreachable
               OpFunctionEnd
        %154 = OpFunction %void None %155
        %176 = OpLabel
        %177 = OpFunctionCall %void %tri_vert_main
               OpReturn
               OpFunctionEnd

Assembly output with inline:

               OpCapability Shader
               OpCapability Matrix
               OpCapability Int64
               OpCapability Int8
               OpCapability Int16
               OpMemoryModel Logical GLSL450
               OpEntryPoint Vertex %153 "main" %ptr %position
          %5 = OpString "tri_vert.zig"
               OpSourceExtension "zig_errors:"
               OpSource Zig 0
               OpName %void "void"
               OpName %f32 "f32"
               OpName %ptr "ptr"
               OpName %u32 "u32"
               OpName %u64 "u64"
               OpName %tri_vert_main "tri_vert.main"
               OpName %ptr "vertices"
               OpName %position "position"
               OpDecorate %_ptr_Input_v3f32 ArrayStride 16
               OpDecorate %ptr Location 0
               OpDecorate %_ptr_Function_v3f32 ArrayStride 16
               OpDecorate %_ptr_Function_f32 ArrayStride 4
               OpDecorate %_ptr_Output_f32 ArrayStride 4
               OpDecorate %_ptr_Output_v4f32 ArrayStride 16
               OpDecorate %position BuiltIn Position
       %void = OpTypeVoid
        %f32 = OpTypeFloat 32
      %v3f32 = OpTypeVector %f32 3
%_ptr_Input_v3f32 = OpTypePointer Input %v3f32
        %u32 = OpTypeInt 32 0
      %u32_1 = OpConstant %u32 1
       %bool = OpTypeBool
        %u64 = OpTypeInt 64 0
      %u64_0 = OpConstant %u64 0
%_ptr_Function_v3f32 = OpTypePointer Function %v3f32
%_ptr_Function_f32 = OpTypePointer Function %f32
%_ptr_Output_f32 = OpTypePointer Output %f32
      %v4f32 = OpTypeVector %f32 4
%_ptr_Output_v4f32 = OpTypePointer Output %v4f32
      %u32_0 = OpConstant %u32 0
      %u64_1 = OpConstant %u64 1
      %u64_2 = OpConstant %u64 2
      %u32_2 = OpConstant %u32 2
      %u32_3 = OpConstant %u32 3
      %f32_1 = OpConstant %f32 1
        %ptr = OpVariable %_ptr_Input_v3f32 Input
   %position = OpVariable %_ptr_Output_v4f32 Output
        %154 = OpTypeFunction %void
%tri_vert_main = OpFunction %void None %154
          %4 = OpLabel
         %24 = OpVariable %_ptr_Function_v3f32 Function
         %35 = OpVariable %_ptr_Function_v3f32 Function
         %42 = OpVariable %_ptr_Function_v3f32 Function
               OpLine %5 13 13
               OpLine %5 5 5
               OpBranch %13
         %13 = OpLabel
         %15 = OpIEqual %bool %u32_1 %u32_1
               OpSelectionMerge %17 None
               OpBranchConditional %15 %18 %17
         %18 = OpLabel
               OpLine %5 15 8
         %19 = OpLoad %v3f32 %ptr Aligned 16
               OpLine %5 15 37
               OpStore %24 %19
         %25 = OpInBoundsAccessChain %_ptr_Function_f32 %24 %u64_0
         %26 = OpLoad %f32 %25
         %32 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_0
               OpStore %32 %26 None
         %33 = OpLoad %v3f32 %ptr Aligned 16
               OpLine %5 15 50
               OpStore %35 %33
         %36 = OpInBoundsAccessChain %_ptr_Function_f32 %35 %u64_1
         %37 = OpLoad %f32 %36
         %39 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_1
               OpStore %39 %37 None
         %40 = OpLoad %v3f32 %ptr Aligned 16
               OpLine %5 15 63
               OpStore %42 %40
         %43 = OpInBoundsAccessChain %_ptr_Function_f32 %42 %u64_2
         %44 = OpLoad %f32 %43
         %46 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_2
               OpStore %46 %44 None
         %48 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_3
               OpStore %48 %f32_1 None
               OpReturn
         %17 = OpLabel
               OpUnreachable
               OpFunctionEnd
        %153 = OpFunction %void None %154
        %175 = OpLabel
        %176 = OpFunctionCall %void %tri_vert_main
               OpReturn
               OpFunctionEnd

See how without inline it generates this function body:

%gpu_location__anon_489 = OpFunction %void None %155
         %47 = OpLabel
               OpLine %48 26 5
               OpReturn
         %49 = OpLabel
               OpUnreachable
               OpFunctionEnd

As well as a call to it:

%6 = OpFunctionCall %void %gpu_location__anon_489

It seems that the stage2 compiler, when targeting SPIR-V, does not yet
have many optimizations such as function inlining and DCE.
Interestingly, a function such as location gets generated and gets a
function call, but it has an empty body.

When these functions are marked inline, they are not generated in the
emitted binary; only their instructions are.
@alexrp
Copy link
Member

alexrp commented Aug 3, 2025

cc @alichraghi

Copy link
Contributor

@alichraghi alichraghi left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I tried doing this before, but it didn't work at that time for some reason.
LGTM!

@alexrp alexrp enabled auto-merge (rebase) August 3, 2025 13:15
@alexrp alexrp disabled auto-merge August 3, 2025 18:10
@DialecticalMaterialist
Copy link
Contributor Author

I will close this in favor of my other PR #24681

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants