-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
Open
Labels
performance (Must go faster)
Description
Consider
"""
    copyto_odd1!(dst::Vector{T}, src::Vector{T}) where {T}

Copy the odd-indexed elements of `src` into the same positions of `dst` and return
`dst`; even-indexed elements of `dst` are left untouched. Calls `error()` when the two
vectors differ in length; that equal-length check is what the `@inbounds` accesses in
the loop rely on.

This is the strided variant of the reproducer: the loop visits only the odd indices,
via the range `1:2:length(src)`.
"""
function copyto_odd1!(dst::Vector{T}, src::Vector{T}) where {T}
    length(src) == length(dst) || error()
    @simd for i ∈ 1:2:length(src)
        @inbounds src_i = src[i]
        @inbounds dst[i] = src_i
    end
    dst
end

and
"""
    copyto_odd2!(dst::Vector{T}, src::Vector{T}) where {T}

Copy the odd-indexed elements of `src` into the same positions of `dst` and return
`dst`; even-indexed elements of `dst` are left untouched.

This is the predicated variant of the reproducer: the loop visits every index shared by
`src` and `dst` (`eachindex(src, dst)`) and skips the even ones with an `isodd(i)` test.
The `eachindex(src, dst)` call is also what rejects vectors whose indices do not match,
which is what the `@inbounds` accesses in the loop rely on.
"""
function copyto_odd2!(dst::Vector{T}, src::Vector{T}) where {T}
    @simd for i ∈ eachindex(src, dst)
        if isodd(i)
            @inbounds src_i = src[i]
            @inbounds dst[i] = src_i
        end
    end
    dst
end

I find a substantial performance difference between these two functions:
julia> let
dst = rand(800)
src = rand(800)
print("copyto_odd1!: "); @btime copyto_odd1!($dst, $src)
print("copyto_odd2!: "); @btime copyto_odd2!($dst, $src)
end;
copyto_odd1!: 251.667 ns (0 allocations: 0 bytes)
copyto_odd2!: 42.977 ns (0 allocations: 0 bytes)

and it seems to be about how the vectorization opportunities are found and implemented. As pointed out in a Zulip conversation, it seems that copyto_odd1! ends up using masked.gather/masked.scatter instructions, whereas copyto_odd2! uses masked.load and masked.store instructions.
I get that this is likely a problem on LLVM's end, but I'm wondering if there are ways we can nudge codegen towards generating LLVM code that's more likely to vectorize properly here.
Edit: I accidentally wrote the body of copyto_odd2! without the if statement; this is now fixed.
Edit2:
I tested this on 1.12.1 and nightly, using this machine:
julia> versioninfo()
Julia Version 1.12.1
Commit ba1e628ee49 (2025-10-17 13:02 UTC)
Build Info:
Official https://julialang.org release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 16 × AMD Ryzen 7 7840U w/ Radeon 780M Graphics
WORD_SIZE: 64
LLVM: libLLVM-18.1.7 (ORCJIT, znver4)
GC: Built with stock GC
Threads: 8 default, 1 interactive, 8 GC (on 16 virtual cores)
Environment:
JULIA_NUM_THREADS = 8
JULIA_EDITOR = emacsclient
Here's the code_llvm I get on 1.12.1:
copyto_odd1!
julia> code_llvm(copyto_odd1!, Tuple{Vector{Float64}, Vector{Float64}})
; Function Signature: copyto_odd1!(Array{Float64, 1}, Array{Float64, 1})
; @ REPL[2]:1 within `copyto_odd1!`
define nonnull ptr @"julia_copyto_odd1!_5157"(ptr noundef nonnull align 8 dereferenceable(24) %"dst::Array", ptr noundef nonnull align 8 dereferenceable(24) %"src::Array") #0 {
top:
%"new::StepRange" = alloca [3 x i64], align 8
; @ REPL[2]:2 within `copyto_odd1!`
; ┌ @ essentials.jl:11 within `length`
%"src::Array.size_ptr" = getelementptr inbounds i8, ptr %"src::Array", i64 16
%"src::Array.size.0.copyload" = load i64, ptr %"src::Array.size_ptr", align 8
%"dst::Array.size_ptr" = getelementptr inbounds i8, ptr %"dst::Array", i64 16
%"dst::Array.size.0.copyload" = load i64, ptr %"dst::Array.size_ptr", align 8
; └
; ┌ @ promotion.jl:637 within `==`
%.not = icmp eq i64 %"src::Array.size.0.copyload", %"dst::Array.size.0.copyload"
; └
br i1 %.not, label %L9, label %L124
L9: ; preds = %top
; @ REPL[2]:3 within `copyto_odd1!`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ range.jl:22 within `Colon`
; ││┌ @ range.jl:24 within `_colon`
; │││┌ @ range.jl:391 within `StepRange` @ range.jl:336
; ││││┌ @ range.jl:351 within `steprange_last`
; │││││┌ @ promotion.jl:637 within `==`
%.not39 = icmp eq i64 %"src::Array.size.0.copyload", 1
; │││││└
br i1 %.not39, label %L46, label %L17
L17: ; preds = %L9
; │││││ @ range.jl:354 within `steprange_last`
; │││││┌ @ operators.jl:425 within `>`
; ││││││┌ @ int.jl:83 within `<`
%0 = icmp sgt i64 %"src::Array.size.0.copyload", 1
; │││││└└
br i1 %0, label %L37, label %L46
L37: ; preds = %L17
; │││││ @ range.jl:367 within `steprange_last`
; │││││┌ @ int.jl:302 within `rem`
%1 = or i64 %"src::Array.size.0.copyload", -2
%.neg = add nsw i64 %1, 1
; │││││└
; │││││ @ range.jl:370 within `steprange_last`
%value_phi31 = add i64 %.neg, %"src::Array.size.0.copyload"
br label %L46
L46: ; preds = %L37, %L17, %L9
%value_phi = phi i64 [ %value_phi31, %L37 ], [ 0, %L17 ], [ 1, %L9 ]
; ││││└
store i64 1, ptr %"new::StepRange", align 8
%2 = getelementptr inbounds i8, ptr %"new::StepRange", i64 8
store i64 2, ptr %2, align 8
%3 = getelementptr inbounds i8, ptr %"new::StepRange", i64 16
store i64 %value_phi, ptr %3, align 8
; │└└└
; │ @ simdloop.jl:71 within `macro expansion`
; │┌ @ simdloop.jl:51 within `simd_inner_length`
%4 = call i64 @j_length_5159(ptr nocapture nonnull readonly %"new::StepRange")
; │└
; │ @ simdloop.jl:72 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%5 = icmp slt i64 %4, 1
; │└
br i1 %5, label %L122, label %L56.preheader
L56.preheader: ; preds = %L46
%memoryref_data = load ptr, ptr %"src::Array", align 8
%memoryref_data10 = load ptr, ptr %"dst::Array", align 8
; │ @ simdloop.jl:75 within `macro expansion`
%min.iters.check = icmp ult i64 %4, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck
vector.memcheck: ; preds = %L56.preheader
%6 = shl i64 %4, 4
%7 = add i64 %6, -8
%scevgep = getelementptr i8, ptr %memoryref_data10, i64 %7
%scevgep42 = getelementptr i8, ptr %memoryref_data, i64 %7
%bound0 = icmp ult ptr %memoryref_data10, %scevgep42
%bound1 = icmp ult ptr %memoryref_data, %scevgep
%found.conflict = and i1 %bound0, %bound1
br i1 %found.conflict, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i64 %4, 9223372036854775800
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%8 = shl <8 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
%9 = getelementptr inbounds i8, ptr %memoryref_data, <8 x i64> %8
%wide.masked.gather = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %9, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> poison)
%10 = getelementptr inbounds i8, ptr %memoryref_data10, <8 x i64> %8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %wide.masked.gather, <8 x ptr> %10, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%index.next = add nuw i64 %index, 8
%vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
%11 = icmp eq i64 %index.next, %n.vec
br i1 %11, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└
; │ @ simdloop.jl:75 within `macro expansion`
%cmp.n = icmp eq i64 %4, %n.vec
br i1 %cmp.n, label %L122, label %scalar.ph
scalar.ph: ; preds = %middle.block, %vector.memcheck, %L56.preheader
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L56.preheader ], [ 0, %vector.memcheck ]
%xtraiter = and i64 %4, 7
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %L59.prol.loopexit, label %L59.prol
L59.prol: ; preds = %L59.prol, %scalar.ph
%value_phi341.prol = phi i64 [ %13, %L59.prol ], [ %bc.resume.val, %scalar.ph ]
%prol.iter = phi i64 [ %prol.iter.next, %L59.prol ], [ 0, %scalar.ph ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.prol = shl i64 %value_phi341.prol, 4
%memoryref_data6.prol = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.prol
%12 = load double, ptr %memoryref_data6.prol, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.prol = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.prol
store double %12, ptr %memoryref_data18.prol, align 8
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%13 = add nuw nsw i64 %value_phi341.prol, 1
; │└
; │ @ simdloop.jl:75 within `macro expansion`
%prol.iter.next = add i64 %prol.iter, 1
%prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
br i1 %prol.iter.cmp.not, label %L59.prol.loopexit, label %L59.prol
L59.prol.loopexit: ; preds = %L59.prol, %scalar.ph
%value_phi341.unr = phi i64 [ %bc.resume.val, %scalar.ph ], [ %13, %L59.prol ]
%14 = sub nsw i64 %bc.resume.val, %4
%15 = icmp ugt i64 %14, -8
br i1 %15, label %L122, label %L59
L59: ; preds = %L59, %L59.prol.loopexit
%value_phi341 = phi i64 [ %24, %L59 ], [ %value_phi341.unr, %L59.prol.loopexit ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset = shl i64 %value_phi341, 4
%memoryref_data6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset
%16 = load double, ptr %memoryref_data6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset
store double %16, ptr %memoryref_data18, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.1 = add i64 %memoryref_byteoffset, 16
%memoryref_data6.1 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.1
%17 = load double, ptr %memoryref_data6.1, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.1 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.1
store double %17, ptr %memoryref_data18.1, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.2 = add i64 %memoryref_byteoffset, 32
%memoryref_data6.2 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.2
%18 = load double, ptr %memoryref_data6.2, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.2 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.2
store double %18, ptr %memoryref_data18.2, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.3 = add i64 %memoryref_byteoffset, 48
%memoryref_data6.3 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.3
%19 = load double, ptr %memoryref_data6.3, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.3 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.3
store double %19, ptr %memoryref_data18.3, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.4 = add i64 %memoryref_byteoffset, 64
%memoryref_data6.4 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.4
%20 = load double, ptr %memoryref_data6.4, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.4 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.4
store double %20, ptr %memoryref_data18.4, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.5 = add i64 %memoryref_byteoffset, 80
%memoryref_data6.5 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.5
%21 = load double, ptr %memoryref_data6.5, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.5 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.5
store double %21, ptr %memoryref_data18.5, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.6 = add i64 %memoryref_byteoffset, 96
%memoryref_data6.6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.6
%22 = load double, ptr %memoryref_data6.6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.6 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.6
store double %22, ptr %memoryref_data18.6, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.7 = add i64 %memoryref_byteoffset, 112
%memoryref_data6.7 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.7
%23 = load double, ptr %memoryref_data6.7, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.7 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.7
store double %23, ptr %memoryref_data18.7, align 8
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%24 = add nuw nsw i64 %value_phi341, 8
; │└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%exitcond.not.7 = icmp eq i64 %24, %4
; │└
br i1 %exitcond.not.7, label %L122, label %L59
L122: ; preds = %L59, %L59.prol.loopexit, %middle.block, %L46
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ array.jl:3134 within `getindex`
; │││┌ @ range.jl:935 within `_getindex`
; ││││┌ @ abstractarray.jl:699 within `checkbounds`
ret ptr %"dst::Array"
L124: ; preds = %top
; └└└└└
; @ REPL[2]:2 within `copyto_odd1!`
call void @j_error_5163() #12
unreachable
}

copyto_odd2!
julia> code_llvm(copyto_odd2!, Tuple{Vector{Float64}, Vector{Float64}})
; Function Signature: copyto_odd2!(Array{Float64, 1}, Array{Float64, 1})
; @ REPL[3]:1 within `copyto_odd2!`
define nonnull ptr @"julia_copyto_odd2!_5170"(ptr noundef nonnull align 8 dereferenceable(24) %"dst::Array", ptr noundef nonnull align 8 dereferenceable(24) %"src::Array") #0 {
top:
%jlcallframe1 = alloca [3 x ptr], align 8
%gcframe2 = alloca [4 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe2, i8 0, i64 32, i1 true)
%thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #13
%tls_ppgcstack = getelementptr inbounds i8, ptr %thread_ptr, i64 -8
%tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
store i64 8, ptr %gcframe2, align 8
%frame.prev = getelementptr inbounds ptr, ptr %gcframe2, i64 1
%task.gcstack = load ptr, ptr %tls_pgcstack, align 8
store ptr %task.gcstack, ptr %frame.prev, align 8
store ptr %gcframe2, ptr %tls_pgcstack, align 8
; @ REPL[3]:2 within `copyto_odd2!`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ abstractarray.jl:382 within `eachindex` @ abstractarray.jl:392 @ abstractarray.jl:389
; ││┌ @ abstractarray.jl:137 within `axes1`
; │││┌ @ abstractarray.jl:98 within `axes`
; ││││┌ @ array.jl:194 within `size`
%"src::Array.size_ptr" = getelementptr inbounds i8, ptr %"src::Array", i64 16
%"src::Array.size.0.copyload" = load i64, ptr %"src::Array.size_ptr", align 8
; ││└└└
; ││ @ abstractarray.jl:382 within `eachindex` @ abstractarray.jl:393
; ││┌ @ abstractarray.jl:399 within `_all_match_first`
; │││┌ @ abstractarray.jl:393 within `#eachindex##0`
; ││││┌ @ abstractarray.jl:389 within `eachindex`
; │││││┌ @ abstractarray.jl:137 within `axes1`
; ││││││┌ @ abstractarray.jl:98 within `axes`
; │││││││┌ @ array.jl:194 within `size`
%"dst::Array.size_ptr" = getelementptr inbounds i8, ptr %"dst::Array", i64 16
%"dst::Array.size.0.copyload" = load i64, ptr %"dst::Array.size_ptr", align 8
; │││└└└└└
; │││┌ @ range.jl:1134 within `==` @ promotion.jl:637
%.not = icmp eq i64 %"src::Array.size.0.copyload", %"dst::Array.size.0.copyload"
; ││└└
br i1 %.not, label %L23, label %L12
L12: ; preds = %top
%ptls_field = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
%ptls_load = load ptr, ptr %ptls_field, align 8
%"box::OneTo" = call noalias nonnull align 8 dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 360, i32 16, i64 140177134164640) #7
%"box::OneTo.tag_addr" = getelementptr inbounds i64, ptr %"box::OneTo", i64 -1
store atomic i64 140177134164640, ptr %"box::OneTo.tag_addr" unordered, align 8
store i64 %"src::Array.size.0.copyload", ptr %"box::OneTo", align 8
%gc_slot_addr_1 = getelementptr inbounds ptr, ptr %gcframe2, i64 3
store ptr %"box::OneTo", ptr %gc_slot_addr_1, align 8
%ptls_load53 = load ptr, ptr %ptls_field, align 8
%"box::OneTo32" = call noalias nonnull align 8 dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load53, i32 360, i32 16, i64 140177134164640) #7
%"box::OneTo32.tag_addr" = getelementptr inbounds i64, ptr %"box::OneTo32", i64 -1
store atomic i64 140177134164640, ptr %"box::OneTo32.tag_addr" unordered, align 8
store i64 %"dst::Array.size.0.copyload", ptr %"box::OneTo32", align 8
%gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe2, i64 2
store ptr %"box::OneTo32", ptr %gc_slot_addr_0, align 8
store ptr @"jl_global#5176.jit", ptr %jlcallframe1, align 8
%0 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 1
store ptr %"box::OneTo", ptr %0, align 8
%1 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 2
store ptr %"box::OneTo32", ptr %1, align 8
%2 = call nonnull ptr @j1_throw_eachindex_mismatch_indices_5174(ptr nonnull @"jl_global#5175.jit", ptr nonnull %jlcallframe1, i32 3)
call void @llvm.trap()
unreachable
L23: ; preds = %top
; │└
; │ @ simdloop.jl:72 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%3 = icmp slt i64 %"src::Array.size.0.copyload", 1
; │└
br i1 %3, label %L91, label %L27.preheader
L27.preheader: ; preds = %L23
%memoryref_data = load ptr, ptr %"src::Array", align 8
%memoryref_data10 = load ptr, ptr %"dst::Array", align 8
; │ @ simdloop.jl:75 within `macro expansion`
%min.iters.check = icmp ult i64 %"src::Array.size.0.copyload", 32
br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck
vector.memcheck: ; preds = %L27.preheader
%4 = shl i64 %"src::Array.size.0.copyload", 3
%scevgep = getelementptr i8, ptr %memoryref_data10, i64 %4
%scevgep42 = getelementptr i8, ptr %memoryref_data, i64 %4
%bound0 = icmp ult ptr %memoryref_data10, %scevgep42
%bound1 = icmp ult ptr %memoryref_data, %scevgep
%found.conflict = and i1 %bound0, %bound1
br i1 %found.conflict, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i64 %"src::Array.size.0.copyload", 9223372036854775776
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
; │┌ @ int.jl:117 within `isodd`
; ││┌ @ number.jl:42 within `iszero`
; │││┌ @ promotion.jl:637 within `==`
%5 = and <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%6 = icmp eq <8 x i64> %5, zeroinitializer
; │└└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%7 = shl i64 %index, 3
%8 = getelementptr i8, ptr %memoryref_data, i64 %7
%9 = getelementptr double, ptr %8, i64 8
%10 = getelementptr double, ptr %8, i64 16
%11 = getelementptr double, ptr %8, i64 24
%wide.masked.load = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %8, i32 8, <8 x i1> %6, <8 x double> poison)
%wide.masked.load46 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %9, i32 8, <8 x i1> %6, <8 x double> poison)
%wide.masked.load47 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %10, i32 8, <8 x i1> %6, <8 x double> poison)
%wide.masked.load48 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %11, i32 8, <8 x i1> %6, <8 x double> poison)
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%12 = getelementptr i8, ptr %memoryref_data10, i64 %7
%13 = getelementptr double, ptr %12, i64 8
%14 = getelementptr double, ptr %12, i64 16
%15 = getelementptr double, ptr %12, i64 24
call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load, ptr %12, i32 8, <8 x i1> %6)
call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load46, ptr %13, i32 8, <8 x i1> %6)
call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load47, ptr %14, i32 8, <8 x i1> %6)
call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load48, ptr %15, i32 8, <8 x i1> %6)
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%index.next = add nuw i64 %index, 32
%vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%16 = icmp eq i64 %index.next, %n.vec
br i1 %16, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└
; │ @ simdloop.jl:75 within `macro expansion`
%cmp.n = icmp eq i64 %"src::Array.size.0.copyload", %n.vec
br i1 %cmp.n, label %L91, label %scalar.ph
scalar.ph: ; preds = %middle.block, %vector.memcheck, %L27.preheader
%bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L27.preheader ], [ 0, %vector.memcheck ]
%xtraiter = and i64 %"src::Array.size.0.copyload", 7
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %L30.prol.loopexit, label %L30.prol
L30.prol: ; preds = %L88.prol, %scalar.ph
%value_phi41.prol = phi i64 [ %19, %L88.prol ], [ %bc.resume.val, %scalar.ph ]
%prol.iter = phi i64 [ %prol.iter.next, %L88.prol ], [ 0, %scalar.ph ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
; │┌ @ int.jl:117 within `isodd`
; ││┌ @ number.jl:42 within `iszero`
; │││┌ @ promotion.jl:637 within `==`
%17 = and i64 %value_phi41.prol, 1
%.not40.not.prol = icmp eq i64 %17, 0
; │└└└
br i1 %.not40.not.prol, label %L64.prol, label %L88.prol
L64.prol: ; preds = %L30.prol
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset.prol = shl i64 %value_phi41.prol, 3
%memoryref_data6.prol = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.prol
%18 = load double, ptr %memoryref_data6.prol, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.prol = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.prol
store double %18, ptr %memoryref_data18.prol, align 8
; ││└
br label %L88.prol
L88.prol: ; preds = %L64.prol, %L30.prol
; │└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%19 = add nuw nsw i64 %value_phi41.prol, 1
; │└
; │ @ simdloop.jl:75 within `macro expansion`
%prol.iter.next = add i64 %prol.iter, 1
%prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
br i1 %prol.iter.cmp.not, label %L30.prol.loopexit, label %L30.prol
L30.prol.loopexit: ; preds = %L88.prol, %scalar.ph
%value_phi41.unr = phi i64 [ %bc.resume.val, %scalar.ph ], [ %19, %L88.prol ]
%20 = sub nsw i64 %bc.resume.val, %"src::Array.size.0.copyload"
%21 = icmp ugt i64 %20, -8
br i1 %21, label %L91, label %scalar.ph.new
scalar.ph.new: ; preds = %L30.prol.loopexit
%22 = and i64 %value_phi41.unr, 1
br label %L30
L30: ; preds = %L88.7, %scalar.ph.new
%value_phi41 = phi i64 [ %value_phi41.unr, %scalar.ph.new ], [ %38, %L88.7 ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
; │┌ @ int.jl:117 within `isodd`
; ││┌ @ number.jl:42 within `iszero`
; │││┌ @ promotion.jl:637 within `==`
%.not40.not = icmp eq i64 %22, 0
; │└└└
br i1 %.not40.not, label %L64, label %L88
L64: ; preds = %L30
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%memoryref_byteoffset = shl i64 %value_phi41, 3
%memoryref_data6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset
%23 = load double, ptr %memoryref_data6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset
store double %23, ptr %memoryref_data18, align 8
; ││└
br label %L88
L88: ; preds = %L64, %L30
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L88.1, label %L64.1
L64.1: ; preds = %L88
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%24 = shl i64 %value_phi41, 3
%memoryref_byteoffset.1 = add i64 %24, 8
%memoryref_data6.1 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.1
%25 = load double, ptr %memoryref_data6.1, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.1 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.1
store double %25, ptr %memoryref_data18.1, align 8
; ││└
br label %L88.1
L88.1: ; preds = %L64.1, %L88
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L64.2, label %L88.2
L64.2: ; preds = %L88.1
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%26 = shl i64 %value_phi41, 3
%memoryref_byteoffset.2 = add i64 %26, 16
%memoryref_data6.2 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.2
%27 = load double, ptr %memoryref_data6.2, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.2 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.2
store double %27, ptr %memoryref_data18.2, align 8
; ││└
br label %L88.2
L88.2: ; preds = %L64.2, %L88.1
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L88.3, label %L64.3
L64.3: ; preds = %L88.2
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%28 = shl i64 %value_phi41, 3
%memoryref_byteoffset.3 = add i64 %28, 24
%memoryref_data6.3 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.3
%29 = load double, ptr %memoryref_data6.3, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.3 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.3
store double %29, ptr %memoryref_data18.3, align 8
; ││└
br label %L88.3
L88.3: ; preds = %L64.3, %L88.2
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L64.4, label %L88.4
L64.4: ; preds = %L88.3
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%30 = shl i64 %value_phi41, 3
%memoryref_byteoffset.4 = add i64 %30, 32
%memoryref_data6.4 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.4
%31 = load double, ptr %memoryref_data6.4, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.4 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.4
store double %31, ptr %memoryref_data18.4, align 8
; ││└
br label %L88.4
L88.4: ; preds = %L64.4, %L88.3
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L88.5, label %L64.5
L64.5: ; preds = %L88.4
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%32 = shl i64 %value_phi41, 3
%memoryref_byteoffset.5 = add i64 %32, 40
%memoryref_data6.5 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.5
%33 = load double, ptr %memoryref_data6.5, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.5 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.5
store double %33, ptr %memoryref_data18.5, align 8
; ││└
br label %L88.5
L88.5: ; preds = %L64.5, %L88.4
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L64.6, label %L88.6
L64.6: ; preds = %L88.5
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%34 = shl i64 %value_phi41, 3
%memoryref_byteoffset.6 = add i64 %34, 48
%memoryref_data6.6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.6
%35 = load double, ptr %memoryref_data6.6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.6 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.6
store double %35, ptr %memoryref_data18.6, align 8
; ││└
br label %L88.6
L88.6: ; preds = %L64.6, %L88.5
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
br i1 %.not40.not, label %L88.7, label %L64.7
L64.7: ; preds = %L88.6
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
%36 = shl i64 %value_phi41, 3
%memoryref_byteoffset.7 = add i64 %36, 56
%memoryref_data6.7 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.7
%37 = load double, ptr %memoryref_data6.7, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
%memoryref_data18.7 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.7
store double %37, ptr %memoryref_data18.7, align 8
; ││└
br label %L88.7
L88.7: ; preds = %L64.7, %L88.6
; │└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%38 = add nuw nsw i64 %value_phi41, 8
; │└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%exitcond.not.7 = icmp eq i64 %38, %"src::Array.size.0.copyload"
; │└
br i1 %exitcond.not.7, label %L91, label %L30
L91: ; preds = %L88.7, %L30.prol.loopexit, %middle.block, %L23
%frame.prev59 = load ptr, ptr %frame.prev, align 8
store ptr %frame.prev59, ptr %tls_pgcstack, align 8
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ array.jl:3134 within `getindex`
; │││┌ @ range.jl:935 within `_getindex`
; ││││┌ @ abstractarray.jl:699 within `checkbounds`
ret ptr %"dst::Array"
; └└└└└
}

Metadata
Metadata
Assignees
Labels
performance (Must go faster)