Skip to content

Commit

Permalink
[ARM] Simplify address calculation for NEON load/store
Browse files Browse the repository at this point in the history
The patch attempts to optimize a sequence of SIMD loads from the same
base pointer:

    %0 = gep float*, float* base, i32 4
    %1 = bitcast float* %0 to <4 x float>*
    %2 = load <4 x float>, <4 x float>* %1
    ...
    %n1 = gep float*, float* base, i32 N
    %n2 = bitcast float* %n1 to <4 x float>*
    %n3 = load <4 x float>, <4 x float>* %n2

For AArch64 the compiler generates a sequence of LDR Qt, [Xn, rust-lang#16].
However, 32-bit NEON VLD1/VST1 lack the [Wn, #imm] addressing mode, so
the address is computed before every ld/st instruction:

    add r2, r0, rust-lang#32
    add r0, r0, rust-lang#16
    vld1.32 {d18, d19}, [r2]
    vld1.32 {d22, d23}, [r0]

This can be improved by computing address for the first load, and then
using a post-indexed form of VLD1/VST1 to load the rest:

    add r0, r0, rust-lang#16
    vld1.32 {d18, d19}, [r0]!
    vld1.32 {d22, d23}, [r0]

In order to do that, the patch adds more patterns to DAGCombine:

  - (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1
    and inc2 are constants.

  - (or ptr inc) is now recognized as a pointer increment if ptr is
    sufficiently aligned.

In addition to that, we now search for all possible base updates and
then pick the best one.

Differential Revision: https://reviews.llvm.org/D108988
  • Loading branch information
asavonic committed Oct 14, 2021
1 parent 8848766 commit dc8a41d
Show file tree
Hide file tree
Showing 12 changed files with 1,113 additions and 614 deletions.
654 changes: 445 additions & 209 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp

Large diffs are not rendered by default.

105 changes: 41 additions & 64 deletions llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,80 +3,57 @@
; rdar://12713765
; When realign-stack is set to false, make sure we are not creating stack
; objects that are assumed to be 64-byte aligned.
@T3_retval = common global <16 x float> zeroinitializer, align 16

define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
entry:
; CHECK-LABEL: test1:
; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: mov r[[R2:[0-9]+]], sp
; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
; CHECK: mov r[[R5:[0-9]+]], r[[R2]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
entry:
%retval = alloca <16 x float>, align 64
%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
store <16 x float> %0, <16 x float>* %retval
%1 = load <16 x float>, <16 x float>* %retval
store <16 x float> %1, <16 x float>* %agg.result, align 16
%a1 = bitcast <16 x float>* %retval to float*
%a2 = getelementptr inbounds float, float* %a1, i64 8
%a3 = bitcast float* %a2 to <4 x float>*

%b1 = bitcast <16 x float>* %agg.result to float*
%b2 = getelementptr inbounds float, float* %b1, i64 8
%b3 = bitcast float* %b2 to <4 x float>*

%0 = load <4 x float>, <4 x float>* %a3, align 16
%1 = load <4 x float>, <4 x float>* %b3, align 16
store <4 x float> %0, <4 x float>* %b3, align 16
store <4 x float> %1, <4 x float>* %a3, align 16
ret void
}

define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
entry:
; CHECK-LABEL: test2:
; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: mov r[[R1:[0-9]+]], sp
; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: mov r[[R3:[0-9]+]], #32
; CHECK: mov r[[R9:[0-9]+]], r[[R1]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]]
; CHECK: mov r[[R3:[0-9]+]], r[[R9]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[ALIGNED:[0-9]+]], sp
; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
entry:
%retval = alloca <16 x float>, align 64
%a1 = bitcast <16 x float>* %retval to float*
%a2 = getelementptr inbounds float, float* %a1, i64 8
%a3 = bitcast float* %a2 to <4 x float>*

%b1 = bitcast <16 x float>* %agg.result to float*
%b2 = getelementptr inbounds float, float* %b1, i64 8
%b3 = bitcast float* %b2 to <4 x float>*

%retval = alloca <16 x float>, align 64
%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
store <16 x float> %0, <16 x float>* %retval
%1 = load <16 x float>, <16 x float>* %retval
store <16 x float> %1, <16 x float>* %agg.result, align 16
%0 = load <4 x float>, <4 x float>* %a3, align 16
%1 = load <4 x float>, <4 x float>* %b3, align 16
store <4 x float> %0, <4 x float>* %b3, align 16
store <4 x float> %1, <4 x float>* %a3, align 16
ret void
}
Loading

0 comments on commit dc8a41d

Please sign in to comment.