Skip to content

Commit

Permalink
Introduce target hook for optimizing register copies
Browse files Browse the repository at this point in the history
Allow a target to do something other than search for copies
that will avoid cross register bank copies.

Implement for SI by only rewriting the most basic copies,
so it should look through anything like a subregister extract.

I'm not entirely satisified with this because it seems like
eliminating a reg_sequence that isn't fully used should work
generically for all targets without them having to override
something. However, it seems to be tricky to have a simple
implementation of this without rewriting to invalid  kinds
of subregister copies on some targets.

I'm not sure if there is currently a generic way to easily check
if a subregister index would be valid for the current use.
The current set of TargetRegisterInfo::get*Class functions don't
quite behave like I would expect (e.g. getSubClassWithSubReg
returns the maximal register class rather than the minimal), so
I'm not sure how to make the generic test keep searching if
SrcRC:SrcSubReg is a valid replacement for DefRC:DefSubReg. Making
the default implementation to check for simple copies breaks
a variety of ARM and x86 tests by producing illegal subregister uses.

The ARM tests are not actually changed since it should still be using
the same sharesSameRegisterFile implementation, this just relaxes
them to not check for specific registers.

llvm-svn: 248478
  • Loading branch information
arsenm committed Sep 24, 2015
1 parent e068f9a commit 68d9386
Show file tree
Hide file tree
Showing 14 changed files with 281 additions and 158 deletions.
9 changes: 9 additions & 0 deletions llvm/include/llvm/Target/TargetRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,15 @@ class TargetRegisterInfo : public MCRegisterInfo {
getMatchingSuperRegClass(const TargetRegisterClass *A,
const TargetRegisterClass *B, unsigned Idx) const;

// For a copy-like instruction that defines a register of class DefRC with
// subreg index DefSubReg, reading from another source with class SrcRC and
// subregister SrcSubReg return true if this is a preferrable copy
// instruction or an earlier use should be used.
virtual bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) const;

/// getSubClassWithSubReg - Returns the largest legal sub-class of RC that
/// supports the sub-register index Idx.
/// If no such sub-class exists, return NULL.
Expand Down
36 changes: 2 additions & 34 deletions llvm/lib/CodeGen/PeepholeOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -577,36 +577,6 @@ bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) {
return TII->optimizeCondBranch(MI);
}

/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
/// share the same register file.
static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) {
// Same register class.
if (DefRC == SrcRC)
return true;

// Both operands are sub registers. Check if they share a register class.
unsigned SrcIdx, DefIdx;
if (SrcSubReg && DefSubReg)
return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
SrcIdx, DefIdx) != nullptr;
// At most one of the register is a sub register, make it Src to avoid
// duplicating the test.
if (!SrcSubReg) {
std::swap(DefSubReg, SrcSubReg);
std::swap(DefRC, SrcRC);
}

// One of the register is a sub register, check if we can get a superclass.
if (SrcSubReg)
return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
// Plain copy.
return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
}

/// \brief Try to find the next source that share the same register file
/// for the value defined by \p Reg and \p SubReg.
/// When true is returned, the \p RewriteMap can be used by the client to
Expand Down Expand Up @@ -687,10 +657,8 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
return false;

const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);

// If this source does not incur a cross register bank copy, use it.
ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, SubReg, SrcRC,
CurSrcPair.SubReg);
ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
CurSrcPair.SubReg);
} while (!ShouldRewrite);

// Continue looking for new sources...
Expand Down
41 changes: 41 additions & 0 deletions llvm/lib/CodeGen/TargetRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,47 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
return BestRC;
}

/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
/// share the same register file.
static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) {
// Same register class.
if (DefRC == SrcRC)
return true;

// Both operands are sub registers. Check if they share a register class.
unsigned SrcIdx, DefIdx;
if (SrcSubReg && DefSubReg) {
return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
SrcIdx, DefIdx) != nullptr;
}

// At most one of the register is a sub register, make it Src to avoid
// duplicating the test.
if (!SrcSubReg) {
std::swap(DefSubReg, SrcSubReg);
std::swap(DefRC, SrcRC);
}

// One of the register is a sub register, check if we can get a superclass.
if (SrcSubReg)
return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;

// Plain copy.
return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) const {
// If this source does not incur a cross register bank copy, use it.
return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg);
}

// Compute target-independent register allocator hints to help eliminate copies.
void
TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}

bool SIRegisterInfo::shouldRewriteCopySrc(
const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) const {
// We want to prefer the smallest register class possible, so we don't want to
// stop and rewrite on anything that looks like a subregister
// extract. Operations mostly don't care about the super register class, so we
// only want to stop on the most basic of copies between the smae register
// class.
//
// e.g. if we have something like
// vreg0 = ...
// vreg1 = ...
// vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
// vreg3 = COPY vreg2, sub0
//
// We want to look through the COPY to find:
// => vreg3 = COPY vreg0

// Plain copy.
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
const TargetRegisterClass *SubRC,
unsigned Channel) const {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;

bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) const override;

/// \p Channel This is the register channel (e.g. a value from 0-16), not the
/// SubReg index.
/// \returns The sub-register of Reg that is in Channel.
Expand Down
17 changes: 15 additions & 2 deletions llvm/test/CodeGen/AMDGPU/and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,24 @@ endif:
ret void
}

; FIXME: and 0 should be replaced witht copy
; FUNC-LABEL: {{^}}v_and_constant_i64:
; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207
; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], {{v[0-9]+}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], {{v[0-9]+}}
; SI: buffer_store_dwordx2
define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1231231234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}

; FIXME: Should replace and 0
; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
Expand Down
25 changes: 17 additions & 8 deletions llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,11 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
ret void
}

; FIXME: Shuffling to new superregister
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
; CI: buffer_store_dword v[[ADD2]]
; CI: s_endpgm
Expand All @@ -64,11 +61,15 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
ret void
}


; FIXME: the v_lshl_b64 x, x, 32 is a bad way of doing a copy

; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32
; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]]
; CI: buffer_store_dword v[[ADD1]]
; CI: s_endpgm
define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
Expand Down Expand Up @@ -140,13 +141,21 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {

; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
; CI-NOT: v_mov_b32

; CI: s_waitcnt lgkmcnt(0)
; CI: buffer_store_dword
Expand Down
67 changes: 33 additions & 34 deletions llvm/test/CodeGen/AMDGPU/half.ll
Original file line number Diff line number Diff line change
Expand Up @@ -391,13 +391,12 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2
ret void
}

; FIXME: Shouldn't do 4th conversion
; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN-NOT: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
Expand Down Expand Up @@ -476,38 +475,38 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_short
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
%val = load <16 x float>, <16 x float> addrspace(1)* %in
Expand Down
5 changes: 1 addition & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,9 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI-DAG: v_cmp_eq_i32

; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
; SI-DAG: v_cmp_gt_i32_e64
; SI-DAG: v_cmp_gt_i32_e32
; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]

; SI-DAG: v_cmp_gt_i32_e64


; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
Expand Down
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s

; Check that when mubuf addr64 instruction is handled in moveToVALU
; from the pointer, dead register writes are not emitted.

; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32

; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}

; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
; GCN-NOT: v_mov_b32

; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},

define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
bb:
%tmp = icmp sgt i32 %arg3, 0
br i1 %tmp, label %bb4, label %bb17

bb4:
%tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg
%tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1
%tmp16 = load volatile i8, i8 addrspace(1)* %tmp15
br label %bb17

bb17:
ret void
}

attributes #0 = { nounwind }
Loading

0 comments on commit 68d9386

Please sign in to comment.