Skip to content

Commit

Permalink
[AVX512] Remove and autoupgrade many of the broadcast intrinsics
Browse files Browse the repository at this point in the history
Summary:
This autoupgrades most of the broadcast intrinsics. They've been unused in clang for some time.

This leaves the 32x2 intrinsics because they are still used in clang.

Reviewers: RKSimon, zvi, igorb

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D36606

llvm-svn: 310725
  • Loading branch information
topperc committed Aug 11, 2017
1 parent 0f30fe9 commit 561092f
Show file tree
Hide file tree
Showing 12 changed files with 435 additions and 533 deletions.
60 changes: 0 additions & 60 deletions llvm/include/llvm/IR/IntrinsicsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -4440,66 +4440,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v16i32_ty],
[llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <4 x float> source -> <8 x float> result.
// Operands are (source, <8 x float> pass-through, i8 mask); no memory access.
def int_x86_avx512_mask_broadcastf32x4_256 :
GCCBuiltin<"__builtin_ia32_broadcastf32x4_256_mask">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v4f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <4 x float> source -> <16 x float> result.
// Operands are (source, <16 x float> pass-through, i16 mask); no memory access.
// NOTE(review): the builtin name has no "_mask" suffix, unlike most sibling
// broadcast defs -- confirm this matches the front-end builtin.
def int_x86_avx512_mask_broadcastf32x4_512 :
GCCBuiltin<"__builtin_ia32_broadcastf32x4_512">,
Intrinsic<[llvm_v16f32_ty],
[llvm_v4f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <8 x float> source -> <16 x float> result.
// Operands are (source, <16 x float> pass-through, i16 mask); no memory access.
def int_x86_avx512_mask_broadcastf32x8_512 :
GCCBuiltin<"__builtin_ia32_broadcastf32x8_512_mask">,
Intrinsic<[llvm_v16f32_ty],
[llvm_v8f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <2 x double> source -> <4 x double> result.
// Operands are (source, <4 x double> pass-through, i8 mask); no memory access.
def int_x86_avx512_mask_broadcastf64x2_256 :
GCCBuiltin<"__builtin_ia32_broadcastf64x2_256_mask">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v2f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <2 x double> source -> <8 x double> result.
// Operands are (source, <8 x double> pass-through, i8 mask); no memory access.
def int_x86_avx512_mask_broadcastf64x2_512 :
GCCBuiltin<"__builtin_ia32_broadcastf64x2_512_mask">,
Intrinsic<[llvm_v8f64_ty],
[llvm_v2f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <4 x double> source -> <8 x double> result.
// Operands are (source, <8 x double> pass-through, i8 mask); no memory access.
// NOTE(review): the builtin name has no "_mask" suffix, unlike most sibling
// broadcast defs -- confirm this matches the front-end builtin.
def int_x86_avx512_mask_broadcastf64x4_512 :
GCCBuiltin<"__builtin_ia32_broadcastf64x4_512">,
Intrinsic<[llvm_v8f64_ty],
[llvm_v4f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <4 x i32> source -> <8 x i32> result.
// Operands are (source, <8 x i32> pass-through, i8 mask); no memory access.
def int_x86_avx512_mask_broadcasti32x4_256 :
GCCBuiltin<"__builtin_ia32_broadcasti32x4_256_mask">,
Intrinsic<[llvm_v8i32_ty],
[llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <4 x i32> source -> <16 x i32> result.
// Operands are (source, <16 x i32> pass-through, i16 mask); no memory access.
// NOTE(review): the builtin name has no "_mask" suffix, unlike most sibling
// broadcast defs -- confirm this matches the front-end builtin.
def int_x86_avx512_mask_broadcasti32x4_512 :
GCCBuiltin<"__builtin_ia32_broadcasti32x4_512">,
Intrinsic<[llvm_v16i32_ty],
[llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <8 x i32> source -> <16 x i32> result.
// Operands are (source, <16 x i32> pass-through, i16 mask); no memory access.
def int_x86_avx512_mask_broadcasti32x8_512 :
GCCBuiltin<"__builtin_ia32_broadcasti32x8_512_mask">,
Intrinsic<[llvm_v16i32_ty],
[llvm_v8i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <2 x i64> source -> <4 x i64> result.
// Operands are (source, <4 x i64> pass-through, i8 mask); no memory access.
def int_x86_avx512_mask_broadcasti64x2_256 :
GCCBuiltin<"__builtin_ia32_broadcasti64x2_256_mask">,
Intrinsic<[llvm_v4i64_ty],
[llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <2 x i64> source -> <8 x i64> result.
// Operands are (source, <8 x i64> pass-through, i8 mask); no memory access.
def int_x86_avx512_mask_broadcasti64x2_512 :
GCCBuiltin<"__builtin_ia32_broadcasti64x2_512_mask">,
Intrinsic<[llvm_v8i64_ty],
[llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

// Masked sub-vector broadcast: <4 x i64> source -> <8 x i64> result.
// Operands are (source, <8 x i64> pass-through, i8 mask); no memory access.
// NOTE(review): the builtin name has no "_mask" suffix, unlike most sibling
// broadcast defs -- confirm this matches the front-end builtin.
def int_x86_avx512_mask_broadcasti64x4_512 :
GCCBuiltin<"__builtin_ia32_broadcasti64x4_512">,
Intrinsic<[llvm_v8i64_ty],
[llvm_v4i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

// Unmasked intrinsic taking a single i16 operand and producing <16 x i32>;
// no memory access. Presumably broadcasts a 16-bit mask value to every
// element -- TODO confirm against the instruction reference.
def int_x86_avx512_broadcastmw_512 :
GCCBuiltin<"__builtin_ia32_broadcastmw512">,
Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>;
Expand Down
23 changes: 23 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,14 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx2.pblendd.") || // Added in 3.7
Name.startswith("avx.vbroadcastf128") || // Added in 4.0
Name == "avx2.vbroadcasti128" || // Added in 3.7
Name.startswith("avx512.mask.broadcastf32x4.") || // Added in 6.0
Name.startswith("avx512.mask.broadcastf64x2.") || // Added in 6.0
Name.startswith("avx512.mask.broadcasti32x4.") || // Added in 6.0
Name.startswith("avx512.mask.broadcasti64x2.") || // Added in 6.0
Name == "avx512.mask.broadcastf32x8.512" || // Added in 6.0
Name == "avx512.mask.broadcasti32x8.512" || // Added in 6.0
Name == "avx512.mask.broadcastf64x4.512" || // Added in 6.0
Name == "avx512.mask.broadcasti64x4.512" || // Added in 6.0
Name == "xop.vpcmov" || // Added in 3.8
Name == "xop.vpcmov.256" || // Added in 5.0
Name.startswith("avx512.mask.move.s") || // Added in 4.0
Expand Down Expand Up @@ -1221,6 +1229,21 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
else
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
{ 0, 1, 2, 3, 0, 1, 2, 3 });
} else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") ||
Name.startswith("avx512.mask.broadcasti"))) {
unsigned NumSrcElts =
CI->getArgOperand(0)->getType()->getVectorNumElements();
unsigned NumDstElts = CI->getType()->getVectorNumElements();

SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
for (unsigned i = 0; i != NumDstElts; ++i)
ShuffleMask[i] = i % NumSrcElts;

Rep = Builder.CreateShuffleVector(CI->getArgOperand(0),
CI->getArgOperand(0),
ShuffleMask);
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
Name.startswith("avx2.vbroadcast") ||
Name.startswith("avx512.pbroadcast") ||
Expand Down
17 changes: 0 additions & 17 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19908,23 +19908,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
// Lowers a masked sub-vector broadcast intrinsic. The source sub-vector is
// inserted at index 0 of an otherwise-undef vector of the result type, that
// widened vector is used as both inputs of the IntrData->Opc0 node together
// with an i8 immediate, and the node's result is merged with Passthru under
// Mask via getVectorMaskingNode.
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// The result type is taken from the pass-through operand.
EVT resVT = Passthru.getValueType();
// Widen Src to the result width; everything above the source is undef.
SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
DAG.getUNDEF(resVT), Src,
DAG.getIntPtrConstant(0, dl));
SDValue immVal;
// A 256-bit source going to a 512-bit result uses immediate 0x44; every
// other combination uses 0.
// NOTE(review): presumably 0x44 repeats the low 256 bits in both halves and
// 0 repeats the low 128 bits everywhere -- confirm against the immediate
// encoding of the Opc0 node.
if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
immVal = DAG.getConstant(0x44, dl, MVT::i8);
else
immVal = DAG.getConstant(0, dl, MVT::i8);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
subVec, subVec, immVal),
Mask, Passthru, Subtarget, DAG);
}
case BRCST32x2_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
Expand Down
26 changes: 1 addition & 25 deletions llvm/lib/Target/X86/X86IntrinsicsInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ enum IntrinsicType : uint16_t {
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST32x2_TO_VEC,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
Expand Down Expand Up @@ -482,36 +482,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
Expand Down
137 changes: 137 additions & 0 deletions llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3444,3 +3444,140 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone

declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)

; Calls llvm.x86.avx512.mask.broadcastf32x4.512 three times on the same
; <4 x float> source: with a constant all-ones mask (i16 -1), with the variable
; %mask and %x2 as second operand, and with %mask and a zeroinitializer second
; operand. The three results are added together and the sum is returned.
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq

%res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
%res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
%res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
%res4 = fadd <16 x float> %res1, %res2
%res5 = fadd <16 x float> %res3, %res4
ret <16 x float> %res5
}

; Loads the <4 x float> source from %x0ptr and passes it to
; llvm.x86.avx512.mask.broadcastf32x4.512 with %x2 and the variable %mask.
; The expected assembly is a kmovw followed by one masked vbroadcastf32x4
; whose source is memory.
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%x0 = load <4 x float>, <4 x float>* %x0ptr
%res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
ret <16 x float> %res
}

declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8)

; Calls llvm.x86.avx512.mask.broadcastf64x4.512 three times on the same
; <4 x double> source: with a constant all-ones mask (i8 -1), with the variable
; %mask and %x2 as second operand, and with %mask and a zeroinitializer second
; operand. The three results are added together and the sum is returned.
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq

%res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
%res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
%res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
%res4 = fadd <8 x double> %res1, %res2
%res5 = fadd <8 x double> %res3, %res4
ret <8 x double> %res5
}

; Loads the <4 x double> source from %x0ptr and passes it to
; llvm.x86.avx512.mask.broadcastf64x4.512 with %x2 and the variable %mask.
; The expected assembly is a kmovw followed by one masked vbroadcastf64x4
; whose source is memory.
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq

%x0 = load <4 x double>, <4 x double>* %x0ptr
%res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
ret <8 x double> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16)

; Calls llvm.x86.avx512.mask.broadcasti32x4.512 three times on the same
; <4 x i32> source: with a constant all-ones mask (i16 -1), with the variable
; %mask and %x2 as second operand, and with %mask and a zeroinitializer second
; operand. The three results are added together and the sum is returned.
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq

%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
%res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
%res4 = add <16 x i32> %res1, %res2
%res5 = add <16 x i32> %res3, %res4
ret <16 x i32> %res5
}

; Loads the <4 x i32> source from %x0ptr and passes it to
; llvm.x86.avx512.mask.broadcasti32x4.512 with %x2 and the variable %mask.
; The expected assembly is a kmovw followed by one masked vbroadcasti32x4
; whose source is memory.
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq

%x0 = load <4 x i32>, <4 x i32>* %x0ptr
%res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
ret <16 x i32> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8)

; Calls llvm.x86.avx512.mask.broadcasti64x4.512 three times on the same
; <4 x i64> source: with a constant all-ones mask (i8 -1), with the variable
; %mask and %x2 as second operand, and with %mask and a zeroinitializer second
; operand. The three results are added together and the sum is returned.
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq

%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
%res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
%res4 = add <8 x i64> %res1, %res2
%res5 = add <8 x i64> %res3, %res4
ret <8 x i64> %res5
}

; Loads the <4 x i64> source from %x0ptr and passes it to
; llvm.x86.avx512.mask.broadcasti64x4.512 with %x2 and the variable %mask.
; The expected assembly is a kmovw followed by one masked vbroadcasti64x4
; whose source is memory.
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq

%x0 = load <4 x i64>, <4 x i64>* %x0ptr
%res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
ret <8 x i64> %res
}
Loading

0 comments on commit 561092f

Please sign in to comment.