Skip to content

Missed optimization in strided copyto-like operation #60147

@MasonProtter

Description

@MasonProtter

Consider

function copyto_odd1!(dst::Vector{T}, src::Vector{T}) where {T}  # copy odd-indexed elements of src into dst
    length(src) == length(dst) || error()  # lengths must match; otherwise throw via error()
    @simd for i in 1:2:length(src)  # stride-2 range: visits indices 1, 3, 5, ...
        @inbounds src_i = src[i]
        @inbounds dst[i] = src_i
    end
    dst  # return the mutated destination
end

and

function copyto_odd2!(dst::Vector{T}, src::Vector{T}) where {T}  # copy odd-indexed elements of src into dst
    @simd for i in eachindex(src, dst)  # shared indices of both arrays, visited contiguously
        if isodd(i)  # even indices are skipped, leaving dst[i] untouched
            @inbounds src_i = src[i]
            @inbounds dst[i] = src_i
        end
    end
    dst  # return the mutated destination
end

I find a substantial performance difference between these two functions:

julia> let
           dst = rand(800)
           src = rand(800)
           
           print("copyto_odd1!: "); @btime copyto_odd1!($dst, $src)
           print("copyto_odd2!: "); @btime copyto_odd2!($dst, $src)
       end;
copyto_odd1!:   251.667 ns (0 allocations: 0 bytes)
copyto_odd2!:   42.977 ns (0 allocations: 0 bytes)

and it seems to be about how the vectorization opportunities are found and implemented. As pointed out in a Zulip conversation, it seems that copyto_odd1! ends up using masked.gather/masked.scatter instructions whereas copyto_odd2! uses masked.load and masked.store instructions.

I get that this is likely a problem on LLVM's end, but I'm wondering if there are ways we can nudge codegen towards generating LLVM code that's more likely to properly vectorize here.


Edit: I accidentally wrote the body of copyto_odd2! without the if statement, now fixed


Edit2:

I tested this on 1.12.1 and nightly, using this machine:

julia> versioninfo()
Julia Version 1.12.1
Commit ba1e628ee49 (2025-10-17 13:02 UTC)
Build Info:
  Official https://julialang.org release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 16 × AMD Ryzen 7 7840U w/ Radeon  780M Graphics
  WORD_SIZE: 64
  LLVM: libLLVM-18.1.7 (ORCJIT, znver4)
  GC: Built with stock GC
Threads: 8 default, 1 interactive, 8 GC (on 16 virtual cores)
Environment:
  JULIA_NUM_THREADS = 8
  JULIA_EDITOR = emacsclient

Here's the code_llvm I get on 1.12.1:

copyto_odd1!
julia> code_llvm(copyto_odd1!, Tuple{Vector{Float64}, Vector{Float64}})
; Function Signature: copyto_odd1!(Array{Float64, 1}, Array{Float64, 1})
;  @ REPL[2]:1 within `copyto_odd1!`
define nonnull ptr @"julia_copyto_odd1!_5157"(ptr noundef nonnull align 8 dereferenceable(24) %"dst::Array", ptr noundef nonnull align 8 dereferenceable(24) %"src::Array") #0 {
top:
  %"new::StepRange" = alloca [3 x i64], align 8
;  @ REPL[2]:2 within `copyto_odd1!`
; ┌ @ essentials.jl:11 within `length`
   %"src::Array.size_ptr" = getelementptr inbounds i8, ptr %"src::Array", i64 16
   %"src::Array.size.0.copyload" = load i64, ptr %"src::Array.size_ptr", align 8
   %"dst::Array.size_ptr" = getelementptr inbounds i8, ptr %"dst::Array", i64 16
   %"dst::Array.size.0.copyload" = load i64, ptr %"dst::Array.size_ptr", align 8
; └
; ┌ @ promotion.jl:637 within `==`
   %.not = icmp eq i64 %"src::Array.size.0.copyload", %"dst::Array.size.0.copyload"
; └
  br i1 %.not, label %L9, label %L124

L9:                                               ; preds = %top
;  @ REPL[2]:3 within `copyto_odd1!`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ range.jl:22 within `Colon`
; ││┌ @ range.jl:24 within `_colon`
; │││┌ @ range.jl:391 within `StepRange` @ range.jl:336
; ││││┌ @ range.jl:351 within `steprange_last`
; │││││┌ @ promotion.jl:637 within `==`
        %.not39 = icmp eq i64 %"src::Array.size.0.copyload", 1
; │││││└
       br i1 %.not39, label %L46, label %L17

L17:                                              ; preds = %L9
; │││││ @ range.jl:354 within `steprange_last`
; │││││┌ @ operators.jl:425 within `>`
; ││││││┌ @ int.jl:83 within `<`
         %0 = icmp sgt i64 %"src::Array.size.0.copyload", 1
; │││││└└
       br i1 %0, label %L37, label %L46

L37:                                              ; preds = %L17
; │││││ @ range.jl:367 within `steprange_last`
; │││││┌ @ int.jl:302 within `rem`
        %1 = or i64 %"src::Array.size.0.copyload", -2
        %.neg = add nsw i64 %1, 1
; │││││└
; │││││ @ range.jl:370 within `steprange_last`
       %value_phi31 = add i64 %.neg, %"src::Array.size.0.copyload"
       br label %L46

L46:                                              ; preds = %L37, %L17, %L9
       %value_phi = phi i64 [ %value_phi31, %L37 ], [ 0, %L17 ], [ 1, %L9 ]
; ││││└
      store i64 1, ptr %"new::StepRange", align 8
      %2 = getelementptr inbounds i8, ptr %"new::StepRange", i64 8
      store i64 2, ptr %2, align 8
      %3 = getelementptr inbounds i8, ptr %"new::StepRange", i64 16
      store i64 %value_phi, ptr %3, align 8
; │└└└
; │ @ simdloop.jl:71 within `macro expansion`
; │┌ @ simdloop.jl:51 within `simd_inner_length`
    %4 = call i64 @j_length_5159(ptr nocapture nonnull readonly %"new::StepRange")
; │└
; │ @ simdloop.jl:72 within `macro expansion`
; │┌ @ int.jl:83 within `<`
    %5 = icmp slt i64 %4, 1
; │└
   br i1 %5, label %L122, label %L56.preheader

L56.preheader:                                    ; preds = %L46
   %memoryref_data = load ptr, ptr %"src::Array", align 8
   %memoryref_data10 = load ptr, ptr %"dst::Array", align 8
; │ @ simdloop.jl:75 within `macro expansion`
   %min.iters.check = icmp ult i64 %4, 16
   br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck

vector.memcheck:                                  ; preds = %L56.preheader
   %6 = shl i64 %4, 4
   %7 = add i64 %6, -8
   %scevgep = getelementptr i8, ptr %memoryref_data10, i64 %7
   %scevgep42 = getelementptr i8, ptr %memoryref_data, i64 %7
   %bound0 = icmp ult ptr %memoryref_data10, %scevgep42
   %bound1 = icmp ult ptr %memoryref_data, %scevgep
   %found.conflict = and i1 %bound0, %bound1
   br i1 %found.conflict, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
   %n.vec = and i64 %4, 9223372036854775800
   br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %8 = shl <8 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
    %9 = getelementptr inbounds i8, ptr %memoryref_data, <8 x i64> %8
    %wide.masked.gather = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %9, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> poison)
    %10 = getelementptr inbounds i8, ptr %memoryref_data10, <8 x i64> %8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> %wide.masked.gather, <8 x ptr> %10, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %index.next = add nuw i64 %index, 8
    %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
    %11 = icmp eq i64 %index.next, %n.vec
    br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; │└
; │ @ simdloop.jl:75 within `macro expansion`
   %cmp.n = icmp eq i64 %4, %n.vec
   br i1 %cmp.n, label %L122, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %vector.memcheck, %L56.preheader
   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L56.preheader ], [ 0, %vector.memcheck ]
   %xtraiter = and i64 %4, 7
   %lcmp.mod.not = icmp eq i64 %xtraiter, 0
   br i1 %lcmp.mod.not, label %L59.prol.loopexit, label %L59.prol

L59.prol:                                         ; preds = %L59.prol, %scalar.ph
   %value_phi341.prol = phi i64 [ %13, %L59.prol ], [ %bc.resume.val, %scalar.ph ]
   %prol.iter = phi i64 [ %prol.iter.next, %L59.prol ], [ 0, %scalar.ph ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.prol = shl i64 %value_phi341.prol, 4
    %memoryref_data6.prol = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.prol
    %12 = load double, ptr %memoryref_data6.prol, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.prol = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.prol
     store double %12, ptr %memoryref_data18.prol, align 8
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %13 = add nuw nsw i64 %value_phi341.prol, 1
; │└
; │ @ simdloop.jl:75 within `macro expansion`
   %prol.iter.next = add i64 %prol.iter, 1
   %prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
   br i1 %prol.iter.cmp.not, label %L59.prol.loopexit, label %L59.prol

L59.prol.loopexit:                                ; preds = %L59.prol, %scalar.ph
   %value_phi341.unr = phi i64 [ %bc.resume.val, %scalar.ph ], [ %13, %L59.prol ]
   %14 = sub nsw i64 %bc.resume.val, %4
   %15 = icmp ugt i64 %14, -8
   br i1 %15, label %L122, label %L59

L59:                                              ; preds = %L59, %L59.prol.loopexit
   %value_phi341 = phi i64 [ %24, %L59 ], [ %value_phi341.unr, %L59.prol.loopexit ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset = shl i64 %value_phi341, 4
    %memoryref_data6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset
    %16 = load double, ptr %memoryref_data6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset
     store double %16, ptr %memoryref_data18, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.1 = add i64 %memoryref_byteoffset, 16
    %memoryref_data6.1 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.1
    %17 = load double, ptr %memoryref_data6.1, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.1 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.1
     store double %17, ptr %memoryref_data18.1, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.2 = add i64 %memoryref_byteoffset, 32
    %memoryref_data6.2 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.2
    %18 = load double, ptr %memoryref_data6.2, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.2 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.2
     store double %18, ptr %memoryref_data18.2, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.3 = add i64 %memoryref_byteoffset, 48
    %memoryref_data6.3 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.3
    %19 = load double, ptr %memoryref_data6.3, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.3 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.3
     store double %19, ptr %memoryref_data18.3, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.4 = add i64 %memoryref_byteoffset, 64
    %memoryref_data6.4 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.4
    %20 = load double, ptr %memoryref_data6.4, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.4 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.4
     store double %20, ptr %memoryref_data18.4, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.5 = add i64 %memoryref_byteoffset, 80
    %memoryref_data6.5 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.5
    %21 = load double, ptr %memoryref_data6.5, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.5 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.5
     store double %21, ptr %memoryref_data18.5, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.6 = add i64 %memoryref_byteoffset, 96
    %memoryref_data6.6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.6
    %22 = load double, ptr %memoryref_data6.6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.6 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.6
     store double %22, ptr %memoryref_data18.6, align 8
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.7 = add i64 %memoryref_byteoffset, 112
    %memoryref_data6.7 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.7
    %23 = load double, ptr %memoryref_data6.7, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[2]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.7 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.7
     store double %23, ptr %memoryref_data18.7, align 8
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %24 = add nuw nsw i64 %value_phi341, 8
; │└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
    %exitcond.not.7 = icmp eq i64 %24, %4
; │└
   br i1 %exitcond.not.7, label %L122, label %L59

L122:                                             ; preds = %L59, %L59.prol.loopexit, %middle.block, %L46
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ array.jl:3134 within `getindex`
; │││┌ @ range.jl:935 within `_getindex`
; ││││┌ @ abstractarray.jl:699 within `checkbounds`
       ret ptr %"dst::Array"

L124:                                             ; preds = %top
; └└└└└
;  @ REPL[2]:2 within `copyto_odd1!`
  call void @j_error_5163() #12
  unreachable
}
copyto_odd2!
julia> code_llvm(copyto_odd2!, Tuple{Vector{Float64}, Vector{Float64}})
; Function Signature: copyto_odd2!(Array{Float64, 1}, Array{Float64, 1})
;  @ REPL[3]:1 within `copyto_odd2!`
define nonnull ptr @"julia_copyto_odd2!_5170"(ptr noundef nonnull align 8 dereferenceable(24) %"dst::Array", ptr noundef nonnull align 8 dereferenceable(24) %"src::Array") #0 {
top:
  %jlcallframe1 = alloca [3 x ptr], align 8
  %gcframe2 = alloca [4 x ptr], align 16
  call void @llvm.memset.p0.i64(ptr align 16 %gcframe2, i8 0, i64 32, i1 true)
  %thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #13
  %tls_ppgcstack = getelementptr inbounds i8, ptr %thread_ptr, i64 -8
  %tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
  store i64 8, ptr %gcframe2, align 8
  %frame.prev = getelementptr inbounds ptr, ptr %gcframe2, i64 1
  %task.gcstack = load ptr, ptr %tls_pgcstack, align 8
  store ptr %task.gcstack, ptr %frame.prev, align 8
  store ptr %gcframe2, ptr %tls_pgcstack, align 8
;  @ REPL[3]:2 within `copyto_odd2!`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ abstractarray.jl:382 within `eachindex` @ abstractarray.jl:392 @ abstractarray.jl:389
; ││┌ @ abstractarray.jl:137 within `axes1`
; │││┌ @ abstractarray.jl:98 within `axes`
; ││││┌ @ array.jl:194 within `size`
       %"src::Array.size_ptr" = getelementptr inbounds i8, ptr %"src::Array", i64 16
       %"src::Array.size.0.copyload" = load i64, ptr %"src::Array.size_ptr", align 8
; ││└└└
; ││ @ abstractarray.jl:382 within `eachindex` @ abstractarray.jl:393
; ││┌ @ abstractarray.jl:399 within `_all_match_first`
; │││┌ @ abstractarray.jl:393 within `#eachindex##0`
; ││││┌ @ abstractarray.jl:389 within `eachindex`
; │││││┌ @ abstractarray.jl:137 within `axes1`
; ││││││┌ @ abstractarray.jl:98 within `axes`
; │││││││┌ @ array.jl:194 within `size`
          %"dst::Array.size_ptr" = getelementptr inbounds i8, ptr %"dst::Array", i64 16
          %"dst::Array.size.0.copyload" = load i64, ptr %"dst::Array.size_ptr", align 8
; │││└└└└└
; │││┌ @ range.jl:1134 within `==` @ promotion.jl:637
      %.not = icmp eq i64 %"src::Array.size.0.copyload", %"dst::Array.size.0.copyload"
; ││└└
    br i1 %.not, label %L23, label %L12

L12:                                              ; preds = %top
    %ptls_field = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16
    %ptls_load = load ptr, ptr %ptls_field, align 8
    %"box::OneTo" = call noalias nonnull align 8 dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 360, i32 16, i64 140177134164640) #7
    %"box::OneTo.tag_addr" = getelementptr inbounds i64, ptr %"box::OneTo", i64 -1
    store atomic i64 140177134164640, ptr %"box::OneTo.tag_addr" unordered, align 8
    store i64 %"src::Array.size.0.copyload", ptr %"box::OneTo", align 8
    %gc_slot_addr_1 = getelementptr inbounds ptr, ptr %gcframe2, i64 3
    store ptr %"box::OneTo", ptr %gc_slot_addr_1, align 8
    %ptls_load53 = load ptr, ptr %ptls_field, align 8
    %"box::OneTo32" = call noalias nonnull align 8 dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load53, i32 360, i32 16, i64 140177134164640) #7
    %"box::OneTo32.tag_addr" = getelementptr inbounds i64, ptr %"box::OneTo32", i64 -1
    store atomic i64 140177134164640, ptr %"box::OneTo32.tag_addr" unordered, align 8
    store i64 %"dst::Array.size.0.copyload", ptr %"box::OneTo32", align 8
    %gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe2, i64 2
    store ptr %"box::OneTo32", ptr %gc_slot_addr_0, align 8
    store ptr @"jl_global#5176.jit", ptr %jlcallframe1, align 8
    %0 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 1
    store ptr %"box::OneTo", ptr %0, align 8
    %1 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 2
    store ptr %"box::OneTo32", ptr %1, align 8
    %2 = call nonnull ptr @j1_throw_eachindex_mismatch_indices_5174(ptr nonnull @"jl_global#5175.jit", ptr nonnull %jlcallframe1, i32 3)
    call void @llvm.trap()
    unreachable

L23:                                              ; preds = %top
; │└
; │ @ simdloop.jl:72 within `macro expansion`
; │┌ @ int.jl:83 within `<`
    %3 = icmp slt i64 %"src::Array.size.0.copyload", 1
; │└
   br i1 %3, label %L91, label %L27.preheader

L27.preheader:                                    ; preds = %L23
   %memoryref_data = load ptr, ptr %"src::Array", align 8
   %memoryref_data10 = load ptr, ptr %"dst::Array", align 8
; │ @ simdloop.jl:75 within `macro expansion`
   %min.iters.check = icmp ult i64 %"src::Array.size.0.copyload", 32
   br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck

vector.memcheck:                                  ; preds = %L27.preheader
   %4 = shl i64 %"src::Array.size.0.copyload", 3
   %scevgep = getelementptr i8, ptr %memoryref_data10, i64 %4
   %scevgep42 = getelementptr i8, ptr %memoryref_data, i64 %4
   %bound0 = icmp ult ptr %memoryref_data10, %scevgep42
   %bound1 = icmp ult ptr %memoryref_data, %scevgep
   %found.conflict = and i1 %bound0, %bound1
   br i1 %found.conflict, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
   %n.vec = and i64 %"src::Array.size.0.copyload", 9223372036854775776
   br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
; │┌ @ int.jl:117 within `isodd`
; ││┌ @ number.jl:42 within `iszero`
; │││┌ @ promotion.jl:637 within `==`
      %5 = and <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
      %6 = icmp eq <8 x i64> %5, zeroinitializer
; │└└└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %7 = shl i64 %index, 3
    %8 = getelementptr i8, ptr %memoryref_data, i64 %7
    %9 = getelementptr double, ptr %8, i64 8
    %10 = getelementptr double, ptr %8, i64 16
    %11 = getelementptr double, ptr %8, i64 24
    %wide.masked.load = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %8, i32 8, <8 x i1> %6, <8 x double> poison)
    %wide.masked.load46 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %9, i32 8, <8 x i1> %6, <8 x double> poison)
    %wide.masked.load47 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %10, i32 8, <8 x i1> %6, <8 x double> poison)
    %wide.masked.load48 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %11, i32 8, <8 x i1> %6, <8 x double> poison)
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %12 = getelementptr i8, ptr %memoryref_data10, i64 %7
     %13 = getelementptr double, ptr %12, i64 8
     %14 = getelementptr double, ptr %12, i64 16
     %15 = getelementptr double, ptr %12, i64 24
     call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load, ptr %12, i32 8, <8 x i1> %6)
     call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load46, ptr %13, i32 8, <8 x i1> %6)
     call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load47, ptr %14, i32 8, <8 x i1> %6)
     call void @llvm.masked.store.v8f64.p0(<8 x double> %wide.masked.load48, ptr %15, i32 8, <8 x i1> %6)
; │└└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %index.next = add nuw i64 %index, 32
    %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %16 = icmp eq i64 %index.next, %n.vec
    br i1 %16, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; │└
; │ @ simdloop.jl:75 within `macro expansion`
   %cmp.n = icmp eq i64 %"src::Array.size.0.copyload", %n.vec
   br i1 %cmp.n, label %L91, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %vector.memcheck, %L27.preheader
   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %L27.preheader ], [ 0, %vector.memcheck ]
   %xtraiter = and i64 %"src::Array.size.0.copyload", 7
   %lcmp.mod.not = icmp eq i64 %xtraiter, 0
   br i1 %lcmp.mod.not, label %L30.prol.loopexit, label %L30.prol

L30.prol:                                         ; preds = %L88.prol, %scalar.ph
   %value_phi41.prol = phi i64 [ %19, %L88.prol ], [ %bc.resume.val, %scalar.ph ]
   %prol.iter = phi i64 [ %prol.iter.next, %L88.prol ], [ 0, %scalar.ph ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
; │┌ @ int.jl:117 within `isodd`
; ││┌ @ number.jl:42 within `iszero`
; │││┌ @ promotion.jl:637 within `==`
      %17 = and i64 %value_phi41.prol, 1
      %.not40.not.prol = icmp eq i64 %17, 0
; │└└└
   br i1 %.not40.not.prol, label %L64.prol, label %L88.prol

L64.prol:                                         ; preds = %L30.prol
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset.prol = shl i64 %value_phi41.prol, 3
    %memoryref_data6.prol = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.prol
    %18 = load double, ptr %memoryref_data6.prol, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.prol = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.prol
     store double %18, ptr %memoryref_data18.prol, align 8
; ││└
    br label %L88.prol

L88.prol:                                         ; preds = %L64.prol, %L30.prol
; │└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %19 = add nuw nsw i64 %value_phi41.prol, 1
; │└
; │ @ simdloop.jl:75 within `macro expansion`
   %prol.iter.next = add i64 %prol.iter, 1
   %prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
   br i1 %prol.iter.cmp.not, label %L30.prol.loopexit, label %L30.prol

L30.prol.loopexit:                                ; preds = %L88.prol, %scalar.ph
   %value_phi41.unr = phi i64 [ %bc.resume.val, %scalar.ph ], [ %19, %L88.prol ]
   %20 = sub nsw i64 %bc.resume.val, %"src::Array.size.0.copyload"
   %21 = icmp ugt i64 %20, -8
   br i1 %21, label %L91, label %scalar.ph.new

scalar.ph.new:                                    ; preds = %L30.prol.loopexit
   %22 = and i64 %value_phi41.unr, 1
   br label %L30

L30:                                              ; preds = %L88.7, %scalar.ph.new
   %value_phi41 = phi i64 [ %value_phi41.unr, %scalar.ph.new ], [ %38, %L88.7 ]
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
; │┌ @ int.jl:117 within `isodd`
; ││┌ @ number.jl:42 within `iszero`
; │││┌ @ promotion.jl:637 within `==`
      %.not40.not = icmp eq i64 %22, 0
; │└└└
   br i1 %.not40.not, label %L64, label %L88

L64:                                              ; preds = %L30
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %memoryref_byteoffset = shl i64 %value_phi41, 3
    %memoryref_data6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset
    %23 = load double, ptr %memoryref_data6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset
     store double %23, ptr %memoryref_data18, align 8
; ││└
    br label %L88

L88:                                              ; preds = %L64, %L30
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L88.1, label %L64.1

L64.1:                                            ; preds = %L88
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %24 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.1 = add i64 %24, 8
    %memoryref_data6.1 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.1
    %25 = load double, ptr %memoryref_data6.1, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.1 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.1
     store double %25, ptr %memoryref_data18.1, align 8
; ││└
    br label %L88.1

L88.1:                                            ; preds = %L64.1, %L88
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L64.2, label %L88.2

L64.2:                                            ; preds = %L88.1
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %26 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.2 = add i64 %26, 16
    %memoryref_data6.2 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.2
    %27 = load double, ptr %memoryref_data6.2, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.2 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.2
     store double %27, ptr %memoryref_data18.2, align 8
; ││└
    br label %L88.2

L88.2:                                            ; preds = %L64.2, %L88.1
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L88.3, label %L64.3

L64.3:                                            ; preds = %L88.2
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %28 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.3 = add i64 %28, 24
    %memoryref_data6.3 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.3
    %29 = load double, ptr %memoryref_data6.3, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.3 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.3
     store double %29, ptr %memoryref_data18.3, align 8
; ││└
    br label %L88.3

L88.3:                                            ; preds = %L64.3, %L88.2
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L64.4, label %L88.4

L64.4:                                            ; preds = %L88.3
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %30 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.4 = add i64 %30, 32
    %memoryref_data6.4 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.4
    %31 = load double, ptr %memoryref_data6.4, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.4 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.4
     store double %31, ptr %memoryref_data18.4, align 8
; ││└
    br label %L88.4

L88.4:                                            ; preds = %L64.4, %L88.3
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L88.5, label %L64.5

L64.5:                                            ; preds = %L88.4
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %32 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.5 = add i64 %32, 40
    %memoryref_data6.5 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.5
    %33 = load double, ptr %memoryref_data6.5, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.5 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.5
     store double %33, ptr %memoryref_data18.5, align 8
; ││└
    br label %L88.5

L88.5:                                            ; preds = %L64.5, %L88.4
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L64.6, label %L88.6

L64.6:                                            ; preds = %L88.5
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %34 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.6 = add i64 %34, 48
    %memoryref_data6.6 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.6
    %35 = load double, ptr %memoryref_data6.6, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.6 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.6
     store double %35, ptr %memoryref_data18.6, align 8
; ││└
    br label %L88.6

L88.6:                                            ; preds = %L64.6, %L88.5
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:3
   br i1 %.not40.not, label %L88.7, label %L64.7

L64.7:                                            ; preds = %L88.6
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:4
; │┌ @ essentials.jl:920 within `getindex`
    %36 = shl i64 %value_phi41, 3
    %memoryref_byteoffset.7 = add i64 %36, 56
    %memoryref_data6.7 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.7
    %37 = load double, ptr %memoryref_data6.7, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ REPL[3]:5
; │┌ @ array.jl:986 within `setindex!`
; ││┌ @ array.jl:991 within `_setindex!`
     %memoryref_data18.7 = getelementptr inbounds i8, ptr %memoryref_data10, i64 %memoryref_byteoffset.7
     store double %37, ptr %memoryref_data18.7, align 8
; ││└
    br label %L88.7

L88.7:                                            ; preds = %L64.7, %L88.6
; │└
; │ @ simdloop.jl:78 within `macro expansion`
; │┌ @ int.jl:87 within `+`
    %38 = add nuw nsw i64 %value_phi41, 8
; │└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
    %exitcond.not.7 = icmp eq i64 %38, %"src::Array.size.0.copyload"
; │└
   br i1 %exitcond.not.7, label %L91, label %L30

L91:                                              ; preds = %L88.7, %L30.prol.loopexit, %middle.block, %L23
   %frame.prev59 = load ptr, ptr %frame.prev, align 8
   store ptr %frame.prev59, ptr %tls_pgcstack, align 8
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ array.jl:3134 within `getindex`
; │││┌ @ range.jl:935 within `_getindex`
; ││││┌ @ abstractarray.jl:699 within `checkbounds`
       ret ptr %"dst::Array"
; └└└└└
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions