`@unroll` unrolling the wrong loop #411

lcw · 2023-07-22T22:24:00Z

This code

using KernelAbstractions
using KernelAbstractions.Extras: @unroll

@inline function g(a, i, ::Val{L}) where {L}
    @unroll for _ = 1:L
        @inbounds a[i] = i
    end
end

@kernel function f(a)
    i = @index(Global)
    g(a, i, Val(1))
end

a = zeros(1000)

kernel = f(CPU(), 256)
kernel(a, ndrange = length(a))

produces 6879 lines of commented LLVM IR for the CPU kernel. The typed kernel IR is

cpu_f(__ctx__, a) @ Main none:0
    ∘ ── %0 = invoke cpu_f(::CompilerMetadata{…},::Array{…})::Core.Const(nothing)
    1 ──       $(Expr(:aliasscope))::Any
273 2 ┄─ %2  = φ (#1 => $(QuoteNode(CartesianIndex(1,))), #15 => %27)::CartesianIndex{1}
    │    %3  = φ (#1 => 1, #15 => %29)::Int64
    └─── %4  = φ (#1 => 1, #15 => %28)::Int64
275 3 ── %5  = KernelAbstractions.__index_Global_Linear(__ctx__, %2)::Int64
276 4 ── %6  = Base.sitofp(Float64, %5)::Float64
    │          Base.arrayset(false, a, %6, %5)::Vector{Float64}
    └───       $(Expr(:loopinfo, (Symbol("llvm.loop.unroll.full"), 1)))::Nothing
    5 ──       goto #6
277 6 ── %10 = Base.add_int(%3, 1)::Int64
    │    %11 = Base.sle_int(1, %10)::Bool
    │    %12 = Base.sle_int(%10, 256)::Bool
    │    %13 = Base.and_int(%11, %12)::Bool
    └───       goto #8 if not %13
    7 ── %15 = (%4 === 256)::Bool
    │    %16 = Base.not_int(%15)::Bool
    └───       goto #9
    8 ──       nothing::Nothing
    9 ┄─ %19 = φ (#7 => %16, #8 => false)::Bool
    └───       goto #10
    10 ─       goto #13 if not %19
    11 ─       nothing::Nothing
    12 ─ %23 = Core.tuple(%10)::Tuple{Int64}
    │    %24 = %new(CartesianIndex{1}, %23)::CartesianIndex{1}
    └───       goto #14
    13 ─       goto #14
    14 ┄ %27 = φ (#12 => %24)::CartesianIndex{1}
    │    %28 = φ (#12 => %10)::Int64
    │    %29 = φ (#12 => %10)::Int64
    │    %30 = φ (#12 => false, #13 => true)::Bool
    │    %31 = Base.not_int(%30)::Bool
    └───       goto #16 if not %31
    15 ─       goto #2
    16 ─       $(Expr(:popaliasscope))::Any
    └───       return Main.nothing

If I remove the @unroll then the typed IR is the same except without $(Expr(:loopinfo, (Symbol("llvm.loop.unroll.full"), 1)))::Nothing but the commented LLVM IR is now 78 lines.

It looks like the loop in function g has been inlined without removing the :loopinfo so the loop inlining gets applied to the innerloop calling the kernel.

The text was updated successfully, but these errors were encountered:

lcw · 2023-07-22T22:31:43Z

This is a simpler MWE of the issue.

using KernelAbstractions.Extras: @unroll

@inline function g_unroll(a, i, ::Val{L}) where {L}
    @unroll for _ = 1:L
        @inbounds a[i] = i
    end
end

function f_unroll(a)
    for i = 1:1000
        g_unroll(a, i, Val(1))
    end
end

@inline function g_nounroll(a, i, ::Val{L}) where {L}
    for _ = 1:L
        @inbounds a[i] = i
    end
end

function f_nounroll(a)
    for i = 1:1000
        g_nounroll(a, i, Val(1))
    end
end

a = zeros(1000)

@code_typed f_nounroll(a)
@code_llvm f_nounroll(a)

@code_typed f_unroll(a)
@code_llvm f_unroll(a)

maleadt · 2023-07-23T12:06:58Z

Here's an IR MWE demonstrating the blowup + compilation hang, in case anybody wants to look at the LLVM side of this:

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
target triple = "x86_64-unknown-linux-gnu"

define void @"julia_cpu_broadcast_kernel!_1269"(i64 addrspace(11)* %0, i8 addrspace(11)* %1, i1 %2, i64 addrspace(11)* %3, i64 %4, i8 addrspace(11)* %5, i64 addrspace(11)* %6, {} addrspace(10)* addrspace(11)* %7, {} addrspace(10)* addrspace(11)* %8, i64 addrspace(11)* %9, i8 addrspace(11)* %10, i64 addrspace(11)* %11, i64 addrspace(11)* %12) {
top.L3_crit_edge:
  br label %L3

L3:                                               ; preds = %L492, %top.L3_crit_edge
  %value_phi = phi i64 [ 0, %top.L3_crit_edge ], [ %4, %L492 ]
  %value_phi1 = phi i64 [ 0, %top.L3_crit_edge ], [ %59, %L492 ]
  %value_phi2 = phi i64 [ 0, %top.L3_crit_edge ], [ %value_phi1, %L492 ]
  br label %L62

L62:                                              ; preds = %L3
  %13 = sub i64 %value_phi, 1
  br label %pass

L77:                                              ; preds = %pass4
  br label %L164

L164:                                             ; preds = %L77
  %14 = select i1 %2, i64 %4, i64 %value_phi1
  %15 = load i8, i8 addrspace(11)* %5, align 1
  %16 = trunc i8 %15 to i1
  %17 = load i64, i64 addrspace(11)* %6, align 8
  %18 = select i1 %16, i64 %17, i64 0
  br label %L261

L261:                                             ; preds = %L164
  %19 = load atomic {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7 unordered, align 8
  %20 = sub i64 %14, 1
  %21 = load i64, i64 addrspace(11)* %9, align 8
  %22 = mul i64 %18, %21
  %23 = add i64 %20, %22
  %24 = addrspacecast {} addrspace(10)* %19 to {} addrspace(11)*
  %25 = bitcast {} addrspace(11)* %24 to { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)*
  %26 = getelementptr inbounds { i8 addrspace(13)*, i64, i16, i16, i32 }, { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)* %25, i32 0, i32 0
  %27 = load i8 addrspace(13)*, i8 addrspace(13)* addrspace(11)* %26, align 8
  %28 = bitcast i8 addrspace(13)* %27 to double addrspace(13)*
  %29 = getelementptr inbounds double, double addrspace(13)* %28, i64 %23
  %30 = load double, double addrspace(13)* %29, align 8
  br label %L270

L270:                                             ; preds = %L261
  %31 = select i1 %2, i64 %4, i64 %65
  %32 = load i8, i8 addrspace(11)* %10, align 1
  %33 = trunc i8 %32 to i1
  %34 = load i64, i64 addrspace(11)* %12, align 8
  %35 = select i1 %33, i64 %34, i64 0
  %36 = load i8, i8 addrspace(11)* %1, align 1
  %37 = trunc i8 %36 to i1
  %38 = load i64, i64 addrspace(11)* %11, align 8
  %39 = select i1 %37, i64 %38, i64 0
  br label %L366

L366:                                             ; preds = %L270
  %40 = sub i64 %31, 1
  %41 = load i64, i64 addrspace(11)* %3, align 8
  %42 = sub i64 %35, 1
  %43 = mul i64 %42, %41
  %44 = add i64 %40, %43
  %45 = load i64, i64 addrspace(11)* %0, align 8
  %46 = mul i64 %4, %45
  %47 = sub i64 %39, 1
  %48 = mul i64 %47, %46
  %49 = add i64 %44, %48
  %50 = getelementptr inbounds double, double addrspace(13)* null, i64 %49
  %51 = load double, double addrspace(13)* %50, align 8
  br label %L377

L377:                                             ; preds = %L366
  %52 = fsub double %30, %51
  br label %L465

L465:                                             ; preds = %L377
  %53 = load atomic {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %8 unordered, align 8
  %54 = addrspacecast {} addrspace(10)* %53 to {} addrspace(11)*
  %55 = bitcast {} addrspace(11)* %54 to { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)*
  %56 = getelementptr inbounds { i8 addrspace(13)*, i64, i16, i16, i32 }, { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)* %55, i32 0, i32 0
  %57 = load i8 addrspace(13)*, i8 addrspace(13)* addrspace(11)* %56, align 8
  %58 = bitcast i8 addrspace(13)* %57 to double addrspace(13)*
  store double %52, double addrspace(13)* %58, align 8
  br label %L468

L468:                                             ; preds = %L465
  call void @julia.loopinfo_marker(), !julia.loopinfo !0
  br label %L472

L472:                                             ; preds = %L468
  %59 = add i64 %value_phi1, 1
  br label %L477

L477:                                             ; preds = %L472
  %60 = icmp eq i64 %value_phi2, 256
  br label %L483

L483:                                             ; preds = %L477
  br i1 %60, label %L485, label %L484

L484:                                             ; preds = %L483
  br label %L486

L485:                                             ; preds = %L483
  br label %L486

L486:                                             ; preds = %L485, %L484
  %value_phi9 = phi i8 [ 0, %L484 ], [ 1, %L485 ]
  %61 = trunc i8 %value_phi9 to i1
  br i1 %61, label %L493, label %L492

L492:                                             ; preds = %L486
  br label %L3

L493:                                             ; preds = %L486
  ret void

pass:                                             ; preds = %L62
  %62 = sdiv i64 %13, %4
  %63 = mul i64 %4, %62
  %64 = sub i64 0, %63
  %65 = add i64 %64, 1
  br label %pass4

pass4:                                            ; preds = %pass
  br label %L77
}

declare void @julia.loopinfo_marker()

!0 = !{!1}
!1 = !{!"llvm.loop.unroll.full", i64 1}

Optimizing this IR (./usr/tools/opt --opaque-pointers=0 --load-pass-plugin=libjulia-codegen.so -julia slow.ll -o bug.bc) yields IR with a single basic block counting 10000 statements. Further compiling that IR (time ./usr/tools/llc bug.bc) takes over 10 seconds, which was the threshold I used when reducing the original IR (which took minutes to compile).

vchuravy · 2023-07-24T15:31:24Z

So when we emit the IR it still looks right.

**julia> @code_llvm optimize=false f_unroll(a)
;  @ REPL[3]:1 within `f_unroll`
define void @julia_f_unroll_449({}* noundef nonnull align 16 dereferenceable(40) %0) #0 {
top:
  %a = alloca {}*, align 8
  %1 = call {}*** @julia.get_pgcstack()
  store {}* null, {}** %a, align 8
  %2 = bitcast {}*** %1 to {}**
  %current_task = getelementptr inbounds {}*, {}** %2, i64 -13
  %3 = bitcast {}** %current_task to i64*
  %world_age = getelementptr inbounds i64, i64* %3, i64 14
  store {}* %0, {}** %a, align 8
;  @ REPL[3]:2 within `f_unroll`
  br i1 false, label %L20, label %top.L2_crit_edge

top.L2_crit_edge:                                 ; preds = %top
  br label %L2

L2:                                               ; preds = %L19, %top.L2_crit_edge
  %value_phi = phi i64 [ 1, %top.L2_crit_edge ], [ %value_phi2, %L19 ]
  %value_phi1 = phi i64 [ 1, %top.L2_crit_edge ], [ %value_phi3, %L19 ]
;  @ REPL[3]:3 within `f_unroll`
; ┌ @ REPL[2]:2 within `g_unroll`
; │┌ @ /home/vchuravy/.julia/packages/KernelAbstractions/DI1ni/src/extras/loopinfo.jl:25 within `macro expansion`
    br i1 false, label %L8, label %L5

L5:                                               ; preds = %L2
; ││ @ /home/vchuravy/.julia/packages/KernelAbstractions/DI1ni/src/extras/loopinfo.jl:26 within `macro expansion`
; ││┌ @ array.jl:969 within `setindex!`
; │││┌ @ number.jl:7 within `convert`
; ││││┌ @ float.jl:159 within `Float64`
       %4 = sitofp i64 %value_phi to double
; │││└└
     %5 = load {}*, {}** %a, align 8
     %6 = sub i64 %value_phi, 1
     %7 = mul i64 %6, 1
     %8 = add i64 0, %7
     %9 = bitcast {}* %5 to { i8*, i64, i16, i16, i32 }*
     %10 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %9, i32 0, i32 0
     %11 = load i8*, i8** %10, align 8
     %12 = bitcast i8* %11 to double*
     %13 = getelementptr inbounds double, double* %12, i64 %8
     store double %4, double* %13, align 8
; ││└
; ││ @ /home/vchuravy/.julia/packages/KernelAbstractions/DI1ni/src/extras/loopinfo.jl:27 within `macro expansion`
    call void @julia.loopinfo_marker()
    br label %L8

L8:                                               ; preds = %L5, %L2
; │└
   br label %L9

L9:                                               ; preds = %L8
; └
;  @ REPL[3]:4 within `f_unroll`
; ┌ @ range.jl:891 within `iterate`
; │┌ @ promotion.jl:499 within `==`
    %14 = icmp eq i64 %value_phi1, 1000
    %15 = zext i1 %14 to i8
; │└
   %16 = trunc i8 %15 to i1
   %17 = xor i1 %16, true
   br i1 %17, label %L12, label %L11

L11:                                              ; preds = %L9
   br label %L14

L12:  julia> @code_llvm optimize=false f_unroll(a)
;  @ REPL[3]:1 within `f_unroll`
define void @julia_f_unroll_449({}* noundef nonnull align 16 dereferenceable(40) %0) #0 {
top:
  %a = alloca {}*, align 8
  %1 = call {}*** @julia.get_pgcstack()
  store {}* null, {}** %a, align 8
  %2 = bitcast {}*** %1 to {}**
  %current_task = getelementptr inbounds {}*, {}** %2, i64 -13
  %3 = bitcast {}** %current_task to i64*
  %world_age = getelementptr inbounds i64, i64* %3, i64 14
  store {}* %0, {}** %a, align 8
;  @ REPL[3]:2 within `f_unroll`
  br i1 false, label %L20, label %top.L2_crit_edge

top.L2_crit_edge:                                 ; preds = %top
  br label %L2

L2:                                               ; preds = %L19, %top.L2_crit_edge
  %value_phi = phi i64 [ 1, %top.L2_crit_edge ], [ %value_phi2, %L19 ]
  %value_phi1 = phi i64 [ 1, %top.L2_crit_edge ], [ %value_phi3, %L19 ]
;  @ REPL[3]:3 within `f_unroll`
; ┌ @ REPL[2]:2 within `g_unroll`
; │┌ @ /home/vchuravy/.julia/packages/KernelAbstractions/DI1ni/src/extras/loopinfo.jl:25 within `macro expansion`
    br i1 false, label %L8, label %L5

L5:                                               ; preds = %L2
; ││ @ /home/vchuravy/.julia/packages/KernelAbstractions/DI1ni/src/extras/loopinfo.jl:26 within `macro expansion`
; ││┌ @ array.jl:969 within `setindex!`
; │││┌ @ number.jl:7 within `convert`
; ││││┌ @ float.jl:159 within `Float64`
       %4 = sitofp i64 %value_phi to double
; │││└└
     %5 = load {}*, {}** %a, align 8
     %6 = sub i64 %value_phi, 1
     %7 = mul i64 %6, 1
     %8 = add i64 0, %7
     %9 = bitcast {}* %5 to { i8*, i64, i16, i16, i32 }*
     %10 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %9, i32 0, i32 0
     %11 = load i8*, i8** %10, align 8
     %12 = bitcast i8* %11 to double*
     %13 = getelementptr inbounds double, double* %12, i64 %8
     store double %4, double* %13, align 8
; ││└
; ││ @ /home/vchuravy/.julia/packages/KernelAbstractions/DI1ni/src/extras/loopinfo.jl:27 within `macro expansion`
    call void @julia.loopinfo_marker()
    br label %L8

L8:                                               ; preds = %L5, %L2
; │└
   br label %L9

L9:                                               ; preds = %L8
; └
;  @ REPL[3]:4 within `f_unroll`
; ┌ @ range.jl:891 within `iterate`
; │┌ @ promotion.jl:499 within `==`
    %14 = icmp eq i64 %value_phi1, 1000
    %15 = zext i1 %14 to i8
; │└
   %16 = trunc i8 %15 to i1
   %17 = xor i1 %16, true
   br i1 %17, label %L12, label %L11

L11:                                              ; preds = %L9
   br label %L14

L12:                                              ; preds = %L9
; │ @ range.jl:892 within `iterate`
; │┌ @ int.jl:87 within `+`
    %18 = add i64 %value_phi1, 1
; │└
; │ @ range.jl:891 within `iterate`
   br label %L14

L14:                                              ; preds = %L12, %L11
   %value_phi2 = phi i64 [ %18, %L12 ], [ undef, %L11 ]
   %value_phi3 = phi i64 [ %18, %L12 ], [ undef, %L11 ]
   %value_phi4 = phi i8 [ 1, %L11 ], [ 0, %L12 ]
; └
  %19 = trunc i8 %value_phi4 to i1
  %20 = xor i1 %19, true
  %21 = zext i1 %20 to i8
  %22 = trunc i8 %21 to i1
  %23 = xor i1 %22, true
  br i1 %23, label %L20, label %L19

L19:                                              ; preds = %L14
  br label %L2

L20:                                              ; preds = %L14, %top
  ret void
}
                                            ; preds = %L9
; │ @ range.jl:892 within `iterate`
; │┌ @ int.jl:87 within `+`
    %18 = add i64 %value_phi1, 1
; │└
; │ @ range.jl:891 within `iterate`
   br label %L14

L14:                                              ; preds = %L12, %L11
   %value_phi2 = phi i64 [ %18, %L12 ], [ undef, %L11 ]
   %value_phi3 = phi i64 [ %18, %L12 ], [ undef, %L11 ]
   %value_phi4 = phi i8 [ 1, %L11 ], [ 0, %L12 ]
; └
  %19 = trunc i8 %value_phi4 to i1
  %20 = xor i1 %19, true
  %21 = zext i1 %20 to i8
  %22 = trunc i8 %21 to i1
  %23 = xor i1 %22, true
  br i1 %23, label %L20, label %L19

L19:                                              ; preds = %L14
  br label %L2

L20:                                              ; preds = %L14, %top
  ret void
}

vchuravy · 2023-07-24T15:43:59Z

Okay in your simple MWE the issue is that simplifycfg removes the inner loop structure... and then the julia-simdloop attaches the information to the outer loop.

lcw · 2023-07-24T15:55:30Z

So is it possible to teach simplifycfg to remove the loopinfo when it removes the inner loop structure?

vchuravy · 2023-07-24T18:19:08Z

No I think we need to fix julia-simdloop . Giving that a go.

vchuravy · 2023-07-24T22:21:42Z

Closing this here since it is a upstream bug.

lcw mentioned this issue Jul 22, 2023

Unroll GridArray setindex! with @nexprs HorribleSanity/Raven.jl#44

Merged

vchuravy mentioned this issue Jul 24, 2023

julia.loopinfo_marker can be attaced to wrong loop. JuliaLang/julia#50660

Closed

vchuravy closed this as not planned Won't fix, can't repro, duplicate, stale Jul 24, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

`@unroll` unrolling the wrong loop #411

`@unroll` unrolling the wrong loop #411

lcw commented Jul 22, 2023

lcw commented Jul 22, 2023

maleadt commented Jul 23, 2023

vchuravy commented Jul 24, 2023

vchuravy commented Jul 24, 2023

lcw commented Jul 24, 2023

vchuravy commented Jul 24, 2023

vchuravy commented Jul 24, 2023

@unroll unrolling the wrong loop #411

@unroll unrolling the wrong loop #411

Comments

lcw commented Jul 22, 2023

lcw commented Jul 22, 2023

maleadt commented Jul 23, 2023

vchuravy commented Jul 24, 2023

vchuravy commented Jul 24, 2023

lcw commented Jul 24, 2023

vchuravy commented Jul 24, 2023

vchuravy commented Jul 24, 2023

`@unroll` unrolling the wrong loop #411

`@unroll` unrolling the wrong loop #411