
Commit 4d80f19

[X86] Change vXi8 MULHU lowering to unpack high and low half of lanes instead of extracting and concatenating low and high half registers.
This reduces the number of shuffle operations that need to be done. The splitting strategy requires the shuffle unit for both the extraction and the extension, whereas with the unpack strategy the unpacks accomplish the splitting and the extending in one operation.

llvm-svn: 348019
1 parent 8191307 commit 4d80f19
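
As a rough illustration of the unpack strategy, here is a minimal intrinsics sketch of the unsigned 128-bit case (the helper name mulhu_v16i8 is hypothetical and not part of this patch): unpacking each operand against a zero register both splits the vector into byte halves and zero-extends them to i16 in a single instruction, so no separate extract-and-extend step is needed.

#include <emmintrin.h> /* SSE2 */

/* Sketch: per-byte unsigned high multiply, (a[i] * b[i]) >> 8, for 16 x i8. */
static __m128i mulhu_v16i8(__m128i a, __m128i b) {
  const __m128i zero = _mm_setzero_si128();

  /* Unpack against zero: splits into low/high byte halves and zero-extends
     each byte to a 16-bit lane in one operation. */
  __m128i alo = _mm_unpacklo_epi8(a, zero);
  __m128i ahi = _mm_unpackhi_epi8(a, zero);
  __m128i blo = _mm_unpacklo_epi8(b, zero);
  __m128i bhi = _mm_unpackhi_epi8(b, zero);

  /* 255*255 fits in 16 bits, so the full product is exact; keep the high byte. */
  __m128i rlo = _mm_srli_epi16(_mm_mullo_epi16(alo, blo), 8);
  __m128i rhi = _mm_srli_epi16(_mm_mullo_epi16(ahi, bhi), 8);

  /* Results are 0..255, so the unsigned saturation in PACKUSWB never fires. */
  return _mm_packus_epi16(rlo, rhi);
}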


4 files changed: +144, -164 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 47 additions & 54 deletions
@@ -23656,69 +23656,62 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
   // and then ashr/lshr the upper bits down to the lower bits before multiply.
   unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
 
-  // For 512-bit vectors, split into 256-bit vectors to allow the
+  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
+      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
+    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
+    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
+    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
+    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
+    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+  }
+
+  // For signed 512-bit vectors, split into 256-bit vectors to allow the
   // sign-extension to occur.
-  if (VT == MVT::v64i8)
+  if (VT == MVT::v64i8 && IsSigned)
     return split512IntArith(Op, DAG);
 
-  // AVX2 implementations - extend xmm subvectors to ymm.
-  if (Subtarget.hasInt256()) {
+  // Signed AVX2 implementation - extend xmm subvectors to ymm.
+  if (VT == MVT::v32i8 && IsSigned) {
     SDValue Lo = DAG.getIntPtrConstant(0, dl);
     SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl);
 
-    if (VT == MVT::v32i8) {
-      if (Subtarget.canExtendTo512BW()) {
-        MVT ExVT = MVT::v32i16;
-        SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
-        SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
-        SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
-        Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
-        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
-      }
-      MVT ExVT = MVT::v16i16;
-      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
-      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
-      SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
-      SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
-      ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
-      BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
-      AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
-      BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
-      Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
-      Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
-      Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
-      Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
-
-      // Bitcast back to VT and then pack all the even elements from Lo and Hi.
-      // Shuffle lowering should turn this into PACKUS+PERMQ
-      Lo = DAG.getBitcast(VT, Lo);
-      Hi = DAG.getBitcast(VT, Hi);
-      return DAG.getVectorShuffle(VT, dl, Lo, Hi,
-                                  { 0,  2,  4,  6,  8, 10, 12, 14,
-                                   16, 18, 20, 22, 24, 26, 28, 30,
-                                   32, 34, 36, 38, 40, 42, 44, 46,
-                                   48, 50, 52, 54, 56, 58, 60, 62});
-    }
-
-    assert(VT == MVT::v16i8 && "Unexpected VT");
-
-    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
-    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
-    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
-    Mul =
-        getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v16i16, Mul, 8, DAG);
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
-  }
-
-  assert(VT == MVT::v16i8 &&
-         "Pre-AVX2 support only supports v16i8 multiplication");
-  MVT ExVT = MVT::v8i16;
+    MVT ExVT = MVT::v16i16;
+    SDValue ALo = extract128BitVector(A, 0, DAG, dl);
+    SDValue BLo = extract128BitVector(B, 0, DAG, dl);
+    SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
+    SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
+    ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
+    BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
+    AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
+    BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
+    Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+    Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+    Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
+    Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
+
+    // Bitcast back to VT and then pack all the even elements from Lo and Hi.
+    // Shuffle lowering should turn this into PACKUS+PERMQ
+    Lo = DAG.getBitcast(VT, Lo);
+    Hi = DAG.getBitcast(VT, Hi);
+    return DAG.getVectorShuffle(VT, dl, Lo, Hi,
+                                { 0,  2,  4,  6,  8, 10, 12, 14,
+                                 16, 18, 20, 22, 24, 26, 28, 30,
+                                 32, 34, 36, 38, 40, 42, 44, 46,
+                                 48, 50, 52, 54, 56, 58, 60, 62});
+  }
+
+  // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
+  // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
+  // shift the results and pack the half lane results back together.
+
+  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
   unsigned ExSSE41 = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
                               : ISD::ZERO_EXTEND_VECTOR_INREG;
 
   // Extract the lo parts and zero/sign extend to i16.
   SDValue ALo, BLo;
-  if (Subtarget.hasSSE41()) {
+  if (VT == MVT::v16i8 && Subtarget.hasSSE41()) {
     ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
     BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
   } else if (IsSigned) {
@@ -23737,7 +23730,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
 
   // Extract the hi parts and zero/sign extend to i16.
   SDValue AHi, BHi;
-  if (Subtarget.hasSSE41()) {
+  if (VT == MVT::v16i8 && Subtarget.hasSSE41()) {
     const int ShufMask[] = { 8,  9, 10, 11, 12, 13, 14, 15,
                             -1, -1, -1, -1, -1, -1, -1, -1};
     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
@@ -23759,7 +23752,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
   }
 
   // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
-  // pack back to v16i8.
+  // pack back to vXi8.
   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
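
The updated test checks below show the same strategy at 256 bits. Note that VPUNPCKLBW, VPUNPCKHBW and VPACKUSWB all operate within each 128-bit lane, so the packed bytes land back in their original positions, which is why the trailing VPERMQ lane fix-up from the old expansion disappears from the checks. A rough intrinsics sketch of the unsigned v32i8 case (mulhu_v32i8 is a hypothetical helper, not code from this patch):

#include <immintrin.h> /* AVX2 */

/* Sketch: per-byte unsigned high multiply for 32 x i8, done per 128-bit lane. */
static __m256i mulhu_v32i8(__m256i a, __m256i b) {
  const __m256i zero = _mm256_setzero_si256();

  /* In-lane unpacks: widen bytes 0-7/16-23 (lo) and 8-15/24-31 (hi) to i16. */
  __m256i alo = _mm256_unpacklo_epi8(a, zero);
  __m256i ahi = _mm256_unpackhi_epi8(a, zero);
  __m256i blo = _mm256_unpacklo_epi8(b, zero);
  __m256i bhi = _mm256_unpackhi_epi8(b, zero);

  /* Exact 16-bit products; shift to keep only the high byte of each. */
  __m256i rlo = _mm256_srli_epi16(_mm256_mullo_epi16(alo, blo), 8);
  __m256i rhi = _mm256_srli_epi16(_mm256_mullo_epi16(ahi, bhi), 8);

  /* The in-lane pack restores the original byte order, so no vpermq is needed. */
  return _mm256_packus_epi16(rlo, rhi);
}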

llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll

Lines changed: 8 additions & 9 deletions
@@ -7,16 +7,15 @@
 define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
 ; AVX256BW-LABEL: test_div7_32i8:
 ; AVX256BW:       # %bb.0:
-; AVX256BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX256BW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX256BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX256BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX256BW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX256BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; AVX256BW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX256BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX256BW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
-; AVX256BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX256BW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX256BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX256BW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX256BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX256BW-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX256BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0

llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll

Lines changed: 16 additions & 18 deletions
@@ -206,16 +206,15 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX2NOBW-LABEL: test_div7_32i8:
 ; AVX2NOBW:       # %bb.0:
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -502,16 +501,15 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX2NOBW-LABEL: test_rem7_32i8:
 ; AVX2NOBW:       # %bb.0:
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
