Skip to content

Commit e1d5787

Browse files
committed
[VectorCombine] Fold chain of (scalar load)->ext->ext to load->ext.
Add a new combine that folds a chain of (scalar load)->ext->ext (with shuffles/casts/inserts in between) to a single vector load and a single wide extend. This makes the IR simpler to analyze and to process, while the backend can still decide to split the wide operations up again. Such IR typically originates from source code written with vector intrinsics. Some examples of real-world use are in https://github.com/ARM-software/astc-encoder/.
1 parent 9c74c92 commit e1d5787

File tree

2 files changed

+67
-48
lines changed

2 files changed

+67
-48
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class VectorCombine {
127127
bool foldShuffleOfShuffles(Instruction &I);
128128
bool foldShuffleOfIntrinsics(Instruction &I);
129129
bool foldShuffleToIdentity(Instruction &I);
130+
bool foldShuffleExtExtracts(Instruction &I);
130131
bool foldShuffleFromReductions(Instruction &I);
131132
bool foldCastFromReductions(Instruction &I);
132133
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -2777,6 +2778,55 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
27772778
return true;
27782779
}
27792780

2781+
/// Try to fold vector zero- and sign-extends that are split across multiple
/// operations into a single extend, removing redundant inserts and shuffles.
///
/// Starting at the shuffle \p I, the matched chain is:
///   scalar load -> insertelement (lane 0) -> bitcast -> zext/sext
///     -> identity-with-extract shuffle (\p I) -> zext/sext
/// and it is rewritten to a single vector load feeding one wide extend.
///
/// \returns true if the IR was changed.
bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
  // Check if we have an extended shuffle that selects the first vector, which
  // itself is another extend fed by a value inserted into lane 0.
  Instruction *L;
  if (!match(
          &I,
          m_OneUse(m_Shuffle(
              m_OneUse(m_ZExtOrSExt(m_OneUse(m_BitCast(m_OneUse(m_InsertElt(
                  m_Value(), m_OneUse(m_Instruction(L)), m_SpecificInt(0))))))),
              m_Value()))) ||
      !cast<ShuffleVectorInst>(&I)->isIdentityWithExtract())
    return false;

  // The inserted scalar must come from a load. Only simple (non-volatile,
  // non-atomic) loads are converted: the replacement below is a plain vector
  // load, which must not stand in for an atomic or volatile access.
  auto *ScalarLoad = dyn_cast<LoadInst>(L);
  if (!ScalarLoad || !ScalarLoad->isSimple())
    return false;

  auto *InnerExt = cast<Instruction>(I.getOperand(0));
  // m_OneUse above guarantees that the shuffle has exactly one user; make sure
  // it is an instruction before inspecting its kind.
  auto *OuterExt = dyn_cast<Instruction>(*I.user_begin());
  if (!OuterExt || !isa<SExtInst, ZExtInst>(OuterExt))
    return false;

  // If the inner extend is a sign extend and the outer one isn't (i.e. a
  // zero-extend), don't fold. If the first one is zero-extend, it doesn't
  // matter if the second one is a sign- or zero-extend.
  if (isa<SExtInst>(InnerExt) && !isa<SExtInst>(OuterExt))
    return false;

  // Don't try to convert the load if it has an odd size.
  if (!DL->typeSizeEqualsStoreSize(ScalarLoad->getType()))
    return false;

  // The vector load must cover exactly the bytes read by the scalar load.
  auto *DstTy = cast<FixedVectorType>(OuterExt->getType());
  auto *SrcTy =
      FixedVectorType::get(InnerExt->getOperand(0)->getType()->getScalarType(),
                           DstTy->getNumElements());
  if (DL->getTypeStoreSize(SrcTy) !=
      DL->getTypeStoreSize(ScalarLoad->getType()))
    return false;

  // Convert to a vector load feeding a single wide extend. The new load is
  // placed directly after the scalar load and reads from the same pointer, so
  // reuse the scalar load's alignment instead of the default alignment for
  // the vector type, which could claim more than is known for this address.
  Builder.SetInsertPoint(*ScalarLoad->getInsertionPointAfterDef());
  LoadInst *NewLoad = Builder.CreateAlignedLoad(
      SrcTy, ScalarLoad->getPointerOperand(), ScalarLoad->getAlign(),
      ScalarLoad->getName() + ".vec");
  Value *NewExt = isa<ZExtInst>(InnerExt) ? Builder.CreateZExt(NewLoad, DstTy)
                                          : Builder.CreateSExt(NewLoad, DstTy);
  // replaceValue performs the RAUW itself, so no separate
  // replaceAllUsesWith call is needed.
  replaceValue(*OuterExt, *NewExt);
  Worklist.pushValue(NewLoad);
  return true;
}
2829+
27802830
/// Given a commutative reduction, the order of the input lanes does not alter
27812831
/// the results. We can use this to remove certain shuffles feeding the
27822832
/// reduction, removing the need to shuffle at all.
@@ -3551,6 +3601,7 @@ bool VectorCombine::run() {
35513601
break;
35523602
case Instruction::ShuffleVector:
35533603
MadeChange |= widenSubvectorLoad(I);
3604+
MadeChange |= foldShuffleExtExtracts(I);
35543605
break;
35553606
default:
35563607
break;

llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll

Lines changed: 16 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
1111
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
1212
; CHECK-SAME: ptr [[DI:%.*]]) {
1313
; CHECK-NEXT: [[ENTRY:.*:]]
14-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
15-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
16-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
17-
; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
18-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
19-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
14+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
15+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
2016
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
2117
;
2218
entry:
@@ -33,12 +29,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
3329
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(
3430
; CHECK-SAME: ptr [[DI:%.*]]) {
3531
; CHECK-NEXT: [[ENTRY:.*:]]
36-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
37-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
38-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
39-
; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
40-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
41-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
32+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
33+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
4234
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
4335
;
4436
entry:
@@ -55,13 +47,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) {
5547
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(
5648
; CHECK-SAME: ptr [[DI:%.*]]) {
5749
; CHECK-NEXT: [[ENTRY:.*:]]
58-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
50+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
51+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
5952
; CHECK-NEXT: call void @use.i32(i32 0)
60-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
61-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
62-
; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
63-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
64-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
6553
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
6654
;
6755
entry:
@@ -221,12 +209,8 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) {
221209
; CHECK-LABEL: define <8 x i32> @load_i64_zext_to_v8i32(
222210
; CHECK-SAME: ptr [[DI:%.*]]) {
223211
; CHECK-NEXT: [[ENTRY:.*:]]
224-
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8
225-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
226-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
227-
; CHECK-NEXT: [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16>
228-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
229-
; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
212+
; CHECK-NEXT: [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
213+
; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext <8 x i8> [[L_VEC]] to <8 x i32>
230214
; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]]
231215
;
232216
entry:
@@ -243,12 +227,8 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) {
243227
; CHECK-LABEL: define <3 x i32> @load_i24_zext_to_v3i32(
244228
; CHECK-SAME: ptr [[DI:%.*]]) {
245229
; CHECK-NEXT: [[ENTRY:.*:]]
246-
; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4
247-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
248-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
249-
; CHECK-NEXT: [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16>
250-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
251-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
230+
; CHECK-NEXT: [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
231+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <3 x i8> [[L_VEC]] to <3 x i32>
252232
; CHECK-NEXT: ret <3 x i32> [[EXT_2]]
253233
;
254234
entry:
@@ -353,12 +333,8 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) {
353333
; CHECK-LABEL: define <4 x i32> @load_i32_sext_to_v4i32(
354334
; CHECK-SAME: ptr [[DI:%.*]]) {
355335
; CHECK-NEXT: [[ENTRY:.*:]]
356-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
357-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
358-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
359-
; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
360-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361-
; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
336+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
337+
; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i8> [[L_VEC]] to <4 x i32>
362338
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
363339
;
364340
entry:
@@ -375,12 +351,8 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) {
375351
; CHECK-LABEL: define <8 x i32> @load_i64_sext_to_v8i32(
376352
; CHECK-SAME: ptr [[DI:%.*]]) {
377353
; CHECK-NEXT: [[ENTRY:.*:]]
378-
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8
379-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
380-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
381-
; CHECK-NEXT: [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16>
382-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
383-
; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
354+
; CHECK-NEXT: [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
355+
; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i8> [[L_VEC]] to <8 x i32>
384356
; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]]
385357
;
386358
entry:
@@ -397,12 +369,8 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) {
397369
; CHECK-LABEL: define <3 x i32> @load_i24_sext_to_v3i32(
398370
; CHECK-SAME: ptr [[DI:%.*]]) {
399371
; CHECK-NEXT: [[ENTRY:.*:]]
400-
; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4
401-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
402-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
403-
; CHECK-NEXT: [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16>
404-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
405-
; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
372+
; CHECK-NEXT: [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
373+
; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i8> [[L_VEC]] to <3 x i32>
406374
; CHECK-NEXT: ret <3 x i32> [[EXT_2]]
407375
;
408376
entry:

0 commit comments

Comments
 (0)