-
Notifications
You must be signed in to change notification settings - Fork 66
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
@unroll
unrolling the wrong loop
#411
Comments
This is a simpler MWE of the issue:
using KernelAbstractions.Extras: @unroll
# Reproducer helper (the `@unroll` variant).
#
# Stores `i` into `a[i]` once per iteration of a loop over `1:L`, where `L` comes
# from the `Val{L}` type parameter. The function is marked `@inline`, and the loop
# carries the `@unroll` annotation from KernelAbstractions.Extras. According to the
# issue text, `@unroll` shows up in the typed IR as
# `Expr(:loopinfo, (Symbol("llvm.loop.unroll.full"), 1))`.
#
# NOTE(review): keep the exact form of this function; it is the trigger for the
# reported mis-unrolling and is compared against `g_nounroll`.
@inline function g_unroll(a, i, ::Val{L}) where {L}
# The loop variable is unused; only the trip count `L` matters.
@unroll for _ = 1:L
# `@inbounds` elides the bounds check, so the caller must supply a valid index `i`.
@inbounds a[i] = i
end
end
# Reproducer driver (the `@unroll` variant).
#
# Calls `g_unroll(a, i, Val(1))` for every `i` in `1:1000`, so the annotated loop
# inside `g_unroll` has a trip count of 1. This outer loop carries no unroll
# annotation of its own.
# NOTE(review): per the issue report, this outer loop appears to be the one that
# ends up unrolled once `g_unroll` is inlined; confirm against the `@code_llvm` output.
function f_unroll(a)
for i = 1:1000
g_unroll(a, i, Val(1))
end
end
# Control-case helper (no `@unroll`).
#
# Same as `g_unroll` except that the loop over `1:L` has no `@unroll` annotation:
# stores `i` into `a[i]` once per iteration, with `L` taken from the `Val{L}` type
# parameter. Kept deliberately parallel to `g_unroll` so the generated code of the
# two variants can be compared.
@inline function g_nounroll(a, i, ::Val{L}) where {L}
# The loop variable is unused; only the trip count `L` matters.
for _ = 1:L
# `@inbounds` elides the bounds check, so the caller must supply a valid index `i`.
@inbounds a[i] = i
end
end
# Control-case driver (no `@unroll`).
#
# Calls `g_nounroll(a, i, Val(1))` for every `i` in `1:1000`. Serves as the
# baseline whose `@code_typed` / `@code_llvm` output is compared with `f_unroll`.
function f_nounroll(a)
for i = 1:1000
g_nounroll(a, i, Val(1))
end
end
# Driver for the MWE: dump the typed IR and the LLVM IR of both variants so they
# can be compared side by side.
#
# `@code_typed` and `@code_llvm` live in InteractiveUtils, which is only loaded
# automatically in the REPL; load it explicitly so this also runs as a script.
using InteractiveUtils

# 1000 elements, matching the `1:1000` index range used by `f_unroll`/`f_nounroll`
# (both write with `@inbounds`, so the array must be at least that long).
a = zeros(1000)

# Baseline: no `@unroll` annotation.
@code_typed f_nounroll(a)
@code_llvm f_nounroll(a)

# Variant under investigation: inner loop annotated with `@unroll`.
@code_typed f_unroll(a)
@code_llvm f_unroll(a)
Here's an IR MWE demonstrating the blowup + compilation hang, in case anybody wants to look at the LLVM side of this:
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
target triple = "x86_64-unknown-linux-gnu"
; Reduced IR reproducer for the unroll blowup / compilation hang.
;
; Structure: one loop whose header is %L3 and whose only back edge comes from
; %L492. Every block inside the loop has a single predecessor, so the body is a
; straight-line chain L3 -> L62 -> pass -> pass4 -> L77 -> L164 -> ... -> L486.
; The loop body contains a call to @julia.loopinfo_marker tagged with
; !julia.loopinfo !0 (see %L468).
define void @"julia_cpu_broadcast_kernel!_1269"(i64 addrspace(11)* %0, i8 addrspace(11)* %1, i1 %2, i64 addrspace(11)* %3, i64 %4, i8 addrspace(11)* %5, i64 addrspace(11)* %6, {} addrspace(10)* addrspace(11)* %7, {} addrspace(10)* addrspace(11)* %8, i64 addrspace(11)* %9, i8 addrspace(11)* %10, i64 addrspace(11)* %11, i64 addrspace(11)* %12) {
top.L3_crit_edge:
br label %L3
; Loop header. Each phi takes 0 on entry; on the back edge:
;   %value_phi  <- %4 (a function argument)
;   %value_phi1 <- %59 (= %value_phi1 + 1, computed in %L472)
;   %value_phi2 <- the previous iteration's %value_phi1
L3: ; preds = %L492, %top.L3_crit_edge
%value_phi = phi i64 [ 0, %top.L3_crit_edge ], [ %4, %L492 ]
%value_phi1 = phi i64 [ 0, %top.L3_crit_edge ], [ %59, %L492 ]
%value_phi2 = phi i64 [ 0, %top.L3_crit_edge ], [ %value_phi1, %L492 ]
br label %L62
L62: ; preds = %L3
%13 = sub i64 %value_phi, 1
br label %pass
L77: ; preds = %pass4
br label %L164
L164: ; preds = %L77
%14 = select i1 %2, i64 %4, i64 %value_phi1
%15 = load i8, i8 addrspace(11)* %5, align 1
%16 = trunc i8 %15 to i1
%17 = load i64, i64 addrspace(11)* %6, align 8
%18 = select i1 %16, i64 %17, i64 0
br label %L261
; First operand: load a double at element offset (%14 - 1) + %18 * load(%9)
; from the data pointer stored in the first field of the object loaded via %7.
L261: ; preds = %L164
%19 = load atomic {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %7 unordered, align 8
%20 = sub i64 %14, 1
%21 = load i64, i64 addrspace(11)* %9, align 8
%22 = mul i64 %18, %21
%23 = add i64 %20, %22
%24 = addrspacecast {} addrspace(10)* %19 to {} addrspace(11)*
%25 = bitcast {} addrspace(11)* %24 to { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)*
%26 = getelementptr inbounds { i8 addrspace(13)*, i64, i16, i16, i32 }, { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)* %25, i32 0, i32 0
%27 = load i8 addrspace(13)*, i8 addrspace(13)* addrspace(11)* %26, align 8
%28 = bitcast i8 addrspace(13)* %27 to double addrspace(13)*
%29 = getelementptr inbounds double, double addrspace(13)* %28, i64 %23
%30 = load double, double addrspace(13)* %29, align 8
br label %L270
; %65 is defined in block %pass (textually below, but executed before this block).
L270: ; preds = %L261
%31 = select i1 %2, i64 %4, i64 %65
%32 = load i8, i8 addrspace(11)* %10, align 1
%33 = trunc i8 %32 to i1
%34 = load i64, i64 addrspace(11)* %12, align 8
%35 = select i1 %33, i64 %34, i64 0
%36 = load i8, i8 addrspace(11)* %1, align 1
%37 = trunc i8 %36 to i1
%38 = load i64, i64 addrspace(11)* %11, align 8
%39 = select i1 %37, i64 %38, i64 0
br label %L366
; Second operand: builds a linear element offset in %49 and loads a double from it.
L366: ; preds = %L270
%40 = sub i64 %31, 1
%41 = load i64, i64 addrspace(11)* %3, align 8
%42 = sub i64 %35, 1
%43 = mul i64 %42, %41
%44 = add i64 %40, %43
%45 = load i64, i64 addrspace(11)* %0, align 8
%46 = mul i64 %4, %45
%47 = sub i64 %39, 1
%48 = mul i64 %47, %46
%49 = add i64 %44, %48
; The base pointer of this GEP is the literal `null` in this reproducer.
; NOTE(review): presumably an artifact of IR reduction - confirm.
%50 = getelementptr inbounds double, double addrspace(13)* null, i64 %49
%51 = load double, double addrspace(13)* %50, align 8
br label %L377
L377: ; preds = %L366
%52 = fsub double %30, %51
br label %L465
; Store the difference %52 through the data pointer held in the first field of
; the object loaded via %8 (no element offset is applied to the store address).
L465: ; preds = %L377
%53 = load atomic {} addrspace(10)*, {} addrspace(10)* addrspace(11)* %8 unordered, align 8
%54 = addrspacecast {} addrspace(10)* %53 to {} addrspace(11)*
%55 = bitcast {} addrspace(11)* %54 to { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)*
%56 = getelementptr inbounds { i8 addrspace(13)*, i64, i16, i16, i32 }, { i8 addrspace(13)*, i64, i16, i16, i32 } addrspace(11)* %55, i32 0, i32 0
%57 = load i8 addrspace(13)*, i8 addrspace(13)* addrspace(11)* %56, align 8
%58 = bitcast i8 addrspace(13)* %57 to double addrspace(13)*
store double %52, double addrspace(13)* %58, align 8
br label %L468
; Loop-info marker carrying metadata node !0.
L468: ; preds = %L465
call void @julia.loopinfo_marker(), !julia.loopinfo !0
br label %L472
L472: ; preds = %L468
%59 = add i64 %value_phi1, 1
br label %L477
; Exit test: compares %value_phi2 (the lagged counter) with 256.
L477: ; preds = %L472
%60 = icmp eq i64 %value_phi2, 256
br label %L483
L483: ; preds = %L477
br i1 %60, label %L485, label %L484
L484: ; preds = %L483
br label %L486
L485: ; preds = %L483
br label %L486
; %value_phi9 is 1 when the exit test held and 0 otherwise; it is truncated back
; to i1 to choose between returning (%L493) and taking the back edge (%L492).
L486: ; preds = %L485, %L484
%value_phi9 = phi i8 [ 0, %L484 ], [ 1, %L485 ]
%61 = trunc i8 %value_phi9 to i1
br i1 %61, label %L493, label %L492
; Back edge to the loop header.
L492: ; preds = %L486
br label %L3
L493: ; preds = %L486
ret void
; Computes %65 = 1 - %4 * ((%value_phi - 1) sdiv %4), using %13 from %L62.
pass: ; preds = %L62
%62 = sdiv i64 %13, %4
%63 = mul i64 %4, %62
%64 = sub i64 0, %63
%65 = add i64 %64, 1
br label %pass4
pass4: ; preds = %pass
br label %L77
}
declare void @julia.loopinfo_marker()
!0 = !{!1}
!1 = !{!"llvm.loop.unroll.full", i64 1}
Optimizing this IR ( |
So when we emit the IR it still looks right.
|
Okay in your simple MWE the issue is that simplifycfg removes the inner loop structure... and then the |
So is it possible to teach simplifycfg to remove the loopinfo when it removes the inner loop structure? |
No I think we need to fix |
Closing this here since it is an upstream bug. |
This code
produces 6879 lines of commented LLVM IR for the CPU kernel. The typed kernel IR is
If I remove the
@unroll
then the typed IR is the same except without `$(Expr(:loopinfo, (Symbol("llvm.loop.unroll.full"), 1)))::Nothing`
but the commented LLVM IR is now 78 lines. It looks like the loop in function
g
has been inlined without removing the `:loopinfo`
so the loop unrolling gets applied to the inner loop calling the kernel. The text was updated successfully, but these errors were encountered: