-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Memory stores aren't vectorised in a `for` loop unless explicit at-inbounds is used
#70
Comments
I'm so confused. On JuliaLang/julia@dc34428 I get:
julia> code_llvm((Memory{Float64},)) do v
           for idx in eachindex(v)
               v[idx] = 1.0
           end
       end
; Function Signature: var"#2"(Memory{Float64})
; @ REPL[1]:2 within `#2`
define void @"julia_#2_1650"(ptr noundef nonnull align 8 dereferenceable(16) %"v::GenericMemory") #0 {
top:
%pgcstack = call ptr inttoptr (i64 4335582988 to ptr)(i64 4335583024) #9
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
%.unbox = load i64, ptr %"v::GenericMemory", align 8
%0 = icmp slt i64 %.unbox, 1
; └└└└
br i1 %0, label %L29, label %mainloop
mainloop: ; preds = %top
%memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %"v::GenericMemory", i64 0, i32 1
%memoryref_data = load ptr, ptr %memory_data_ptr, align 8
%1 = shl nuw nsw i64 %.unbox, 1
%memoryref_bytelen = shl i64 %.unbox, 3
; @ REPL[1]:3 within `#2`
; ┌ @ genericmemory.jl:240 within `setindex!`
%smin26 = call i64 @llvm.smin.i64(i64 %.unbox, i64 %1)
%2 = sub nsw i64 %1, %smin26
%exit.mainloop.at = call i64 @llvm.umin.i64(i64 %.unbox, i64 %2)
%.not = icmp eq i64 %1, %smin26
br i1 %.not, label %postloop, label %L11.preheader
L11.preheader: ; preds = %mainloop
%3 = and i64 %.unbox, 2305843009213693951
%umax = call i64 @llvm.umax.i64(i64 %exit.mainloop.at, i64 1)
%4 = add nsw i64 %umax, -1
%umin = call i64 @llvm.umin.i64(i64 %3, i64 %4)
%min.iters.check = icmp ult i64 %umin, 8
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L11.preheader
%5 = add nuw nsw i64 %umin, 1
%n.mod.vf = and i64 %5, 7
%6 = icmp eq i64 %n.mod.vf, 0
%7 = select i1 %6, i64 8, i64 %n.mod.vf
%n.vec = sub nsw i64 %5, %7
%ind.end = add nsw i64 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%8 = phi i64 [ 1, %vector.ph ], [ %15, %vector.body ]
; │ @ genericmemory.jl:241 within `setindex!`
%9 = shl i64 %8, 3
%10 = getelementptr i8, ptr %memoryref_data, i64 %9
%11 = getelementptr i8, ptr %10, i64 -8
%12 = getelementptr i8, ptr %10, i64 8
%13 = getelementptr i8, ptr %10, i64 24
%14 = getelementptr i8, ptr %10, i64 40
store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %11, align 8
store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %12, align 8
store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %13, align 8
store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %14, align 8
%index.next = add nuw i64 %index, 8
%15 = add i64 %8, 8
%16 = icmp eq i64 %index.next, %n.vec
br i1 %16, label %scalar.ph, label %vector.body
scalar.ph: ; preds = %vector.body, %L11.preheader
%bc.resume.val = phi i64 [ 1, %L11.preheader ], [ %ind.end, %vector.body ]
; │ @ genericmemory.jl:240 within `setindex!`
br label %L11
L11: ; preds = %idxend, %scalar.ph
%value_phi3 = phi i64 [ %17, %idxend ], [ %bc.resume.val, %scalar.ph ]
%memoryref_offset = shl i64 %value_phi3, 3
%memoryref_byteoffset = add i64 %memoryref_offset, -8
%memoryref_isinbounds = icmp ult i64 %memoryref_byteoffset, %memoryref_bytelen
br i1 %memoryref_isinbounds, label %idxend, label %oob
L29: ; preds = %idxend.postloop, %main.exit.selector, %top
; └
; @ REPL[1]:4 within `#2`
ret void
oob: ; preds = %L11.postloop, %L11
%value_phi3.lcssa = phi i64 [ %value_phi3.postloop, %L11.postloop ], [ %value_phi3, %L11 ]
; @ REPL[1]:3 within `#2`
; ┌ @ genericmemory.jl:240 within `setindex!`
%ptls_field = getelementptr inbounds i8, ptr %pgcstack, i64 16
%ptls_load = load ptr, ptr %ptls_field, align 8
%"box::GenericMemoryRef" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 616, i32 32, i64 4689350496) #8
%"box::GenericMemoryRef.tag_addr" = getelementptr inbounds i64, ptr %"box::GenericMemoryRef", i64 -1
store atomic i64 4689350496, ptr %"box::GenericMemoryRef.tag_addr" unordered, align 8
store ptr %memoryref_data, ptr %"box::GenericMemoryRef", align 8
%.repack17 = getelementptr inbounds { ptr, ptr }, ptr %"box::GenericMemoryRef", i64 0, i32 1
store ptr %"v::GenericMemory", ptr %.repack17, align 8
call void @ijl_bounds_error_int(ptr nonnull %"box::GenericMemoryRef", i64 %value_phi3.lcssa)
unreachable
idxend: ; preds = %L11
; │ @ genericmemory.jl:241 within `setindex!`
%memoryref_data11 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset
store i64 4607182418800017408, ptr %memoryref_data11, align 8
; └
; @ REPL[1]:4 within `#2`
; ┌ @ range.jl:915 within `iterate`
%17 = add nuw nsw i64 %value_phi3, 1
; └
%.not35 = icmp ult i64 %value_phi3, %exit.mainloop.at
br i1 %.not35, label %L11, label %main.exit.selector
main.exit.selector: ; preds = %idxend
%18 = icmp ult i64 %value_phi3, %.unbox
br i1 %18, label %postloop, label %L29
postloop: ; preds = %main.exit.selector, %mainloop
%value_phi3.copy = phi i64 [ 1, %mainloop ], [ %17, %main.exit.selector ]
br label %L11.postloop
L11.postloop: ; preds = %idxend.postloop, %postloop
%value_phi3.postloop = phi i64 [ %20, %idxend.postloop ], [ %value_phi3.copy, %postloop ]
; @ REPL[1]:3 within `#2`
; ┌ @ genericmemory.jl:240 within `setindex!`
%memoryref_offset.postloop = add nsw i64 %value_phi3.postloop, -1
%19 = add nuw nsw i64 %.unbox, %memoryref_offset.postloop
%memoryref_ovflw.not.postloop = icmp ult i64 %19, %1
%memoryref_byteoffset.postloop = shl i64 %memoryref_offset.postloop, 3
%memoryref_isinbounds.postloop = icmp ult i64 %memoryref_byteoffset.postloop, %memoryref_bytelen
%"memoryref_isinbounds¬ovflw.postloop" = and i1 %memoryref_ovflw.not.postloop, %memoryref_isinbounds.postloop
br i1 %"memoryref_isinbounds¬ovflw.postloop", label %idxend.postloop, label %oob
idxend.postloop: ; preds = %L11.postloop
; │ @ genericmemory.jl:241 within `setindex!`
%memoryref_data11.postloop = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.postloop
store i64 4607182418800017408, ptr %memoryref_data11.postloop, align 8
; └
; @ REPL[1]:4 within `#2`
; ┌ @ range.jl:915 within `iterate`
; │┌ @ promotion.jl:639 within `==`
%.not.not.postloop = icmp eq i64 %value_phi3.postloop, %.unbox
; │└
%20 = add nuw nsw i64 %value_phi3.postloop, 1
; └
br i1 %.not.not.postloop, label %L29, label %L11.postloop
} For a plain |
This issue is fixed by...reverting #68, and the solution was already in the LLVM IR shown at #65 (comment) 😕 It sounds like we can't have both bounds checking elision and vectorised stores. |
That sounds like a phase ordering problem. |
Follow up from #68 (comment):
I believe this is entirely due to the fact we're using `Memory` instead of `MemoryRef`, and `MemoryRef` is somehow better optimised for stores (it doesn't need to call `memoryrefnew(mem)`). As a proof of concept (don't take this code seriously!), with this patch:
I get
which is pretty much what you'd get for `Vector`s. It looks like using `Memory` instead of `MemoryRef` adds an extra layer of indirection, no idea where to go from here. CC: @oscardssmith who may have opinions about this (I already checked, JuliaLang/julia#55913 doesn't change anything here)
The text was updated successfully, but these errors were encountered: