Skip to content

Commit 07c085a

Browse files
authored
[VPlan] Add narrowToSingleScalarRecipes transform. (#139150)
Add a new narrowToSingleScalarRecipes transform which uses VPlan-based uniformity analysis to determine if wide recipes and replicate recipes can be converted to uniform (single-scalar) recipes. There are a few places where we convert recipes to uniform recipes ad hoc, which this transform will eventually replace. A few more generalizations are required to do so, which I plan to do as follow-ups. By converting the recipes to uniform recipes, we effectively materialize the information from the VPlan-based analysis. Note that there is one regression at the moment in SystemZ/pr47665.ll due to trivial constant folding opportunities in the input IR. This will be fixed by VPlan-based constant folding (#125365). PR: #139150
1 parent 3ccb15d commit 07c085a

File tree

4 files changed

+59
-27
lines changed

4 files changed

+59
-27
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,6 +1085,42 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
10851085
}
10861086
}
10871087

1088+
/// Replaces wide (VPWidenRecipe) and replicating (VPReplicateRecipe) recipes
/// in the top-level blocks of the vector loop region with single-scalar
/// VPReplicateRecipes, when vputils::isSingleScalar() holds for the recipe and
/// every user only uses its scalar values. Does nothing for plans that only
/// have a scalar VF.
static void narrowToSingleScalarRecipes(VPlan &Plan) {
1089+
if (Plan.hasScalarVFOnly())
1090+
return;
1091+
1092+
// Try to narrow wide and replicating recipes to single scalar recipes,
1093+
// based on VPlan analysis. Only process blocks in the loop region for now,
1094+
// without traversing into nested regions, as recipes in replicate regions
1095+
// cannot be converted yet.
1096+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1097+
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1098+
// Recipes are visited bottom-up within each block.
// NOTE(review): presumably so that users are narrowed before the recipes
// defining their operands are inspected — confirm.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1099+
// RepR is null when R is not a VPReplicateRecipe.
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1100+
if (!RepR && !isa<VPWidenRecipe>(&R))
1101+
continue;
1102+
// Already a single-scalar replicate recipe; nothing to narrow.
if (RepR && RepR->isSingleScalar())
1103+
continue;
1104+
1105+
auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1106+
// Skip recipes that aren't single scalars or don't have only their
1107+
// scalar results used. In the latter case, we would introduce extra
1108+
// broadcasts.
1109+
if (!vputils::isSingleScalar(RepOrWidenR) ||
1110+
any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
1111+
return !U->usesScalars(RepOrWidenR);
1112+
}))
1113+
continue;
1114+
1115+
// Build a single-scalar replicate recipe from the same underlying
// instruction and operands as the recipe being narrowed.
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1116+
RepOrWidenR->operands(),
1117+
true /*IsSingleScalar*/);
1118+
Clone->insertBefore(RepOrWidenR);
1119+
// Only the uses are redirected; the original recipe is not erased here.
// NOTE(review): presumably it is removed by a later dead-recipe cleanup —
// confirm.
RepOrWidenR->replaceAllUsesWith(Clone);
1120+
}
1121+
}
1122+
}
1123+
10881124
/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
10891125
/// to make sure the masks are simplified.
10901126
static void simplifyBlends(VPlan &Plan) {
@@ -1779,6 +1815,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
17791815
runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
17801816
runPass(simplifyBlends, Plan);
17811817
runPass(removeDeadRecipes, Plan);
1818+
runPass(narrowToSingleScalarRecipes, Plan);
17821819
runPass(legalizeAndOptimizeInductions, Plan);
17831820
runPass(removeRedundantExpandSCEVRecipes, Plan);
17841821
runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());

llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,86 +7,87 @@ define void @test(ptr %p, i40 %a) {
77
; CHECK-NEXT: entry:
88
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
99
; CHECK: vector.ph:
10+
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i1 true, false
1011
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1112
; CHECK: vector.body:
1213
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
1314
; CHECK: pred.store.if:
14-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
15+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
1516
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
1617
; CHECK: pred.store.continue:
1718
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
1819
; CHECK: pred.store.if1:
19-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
20+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
2021
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
2122
; CHECK: pred.store.continue2:
2223
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
2324
; CHECK: pred.store.if3:
24-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
25+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
2526
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
2627
; CHECK: pred.store.continue4:
2728
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
2829
; CHECK: pred.store.if5:
29-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
30+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
3031
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
3132
; CHECK: pred.store.continue6:
3233
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
3334
; CHECK: pred.store.if7:
34-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
35+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
3536
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
3637
; CHECK: pred.store.continue8:
3738
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
3839
; CHECK: pred.store.if9:
39-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
40+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
4041
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
4142
; CHECK: pred.store.continue10:
4243
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
4344
; CHECK: pred.store.if11:
44-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
45+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
4546
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
4647
; CHECK: pred.store.continue12:
4748
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
4849
; CHECK: pred.store.if13:
49-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
50+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
5051
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
5152
; CHECK: pred.store.continue14:
5253
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
5354
; CHECK: pred.store.if15:
54-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
55+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
5556
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
5657
; CHECK: pred.store.continue16:
5758
; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
5859
; CHECK: pred.store.if17:
59-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
60+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
6061
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]]
6162
; CHECK: pred.store.continue18:
6263
; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
6364
; CHECK: pred.store.if19:
64-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
65+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
6566
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
6667
; CHECK: pred.store.continue20:
6768
; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
6869
; CHECK: pred.store.if21:
69-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
70+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
7071
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
7172
; CHECK: pred.store.continue22:
7273
; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
7374
; CHECK: pred.store.if23:
74-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
75+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
7576
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]]
7677
; CHECK: pred.store.continue24:
7778
; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
7879
; CHECK: pred.store.if25:
79-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
80+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
8081
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]]
8182
; CHECK: pred.store.continue26:
8283
; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
8384
; CHECK: pred.store.if27:
84-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
85+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
8586
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]]
8687
; CHECK: pred.store.continue28:
8788
; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
8889
; CHECK: pred.store.if29:
89-
; CHECK-NEXT: store i1 false, ptr [[P]], align 1
90+
; CHECK-NEXT: store i1 [[TMP0]], ptr [[P]], align 1
9091
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]]
9192
; CHECK: pred.store.continue30:
9293
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]

llvm/test/Transforms/LoopVectorize/X86/cost-model.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -890,9 +890,7 @@ define i64 @cost_assume(ptr %end, i64 %N) {
890890
; CHECK: vector.ph:
891891
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
892892
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
893-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N:%.*]], i64 0
894-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
895-
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
893+
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0
896894
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
897895
; CHECK: vector.body:
898896
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -904,7 +902,6 @@ define i64 @cost_assume(ptr %end, i64 %N) {
904902
; CHECK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1)
905903
; CHECK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1)
906904
; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1)
907-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
908905
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
909906
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
910907
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])

llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,6 @@ define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
159159
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[SCALE]], 1
160160
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
161161
; CHECK: vector.ph:
162-
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
163-
; CHECK-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
164-
; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
165162
; CHECK-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
166163
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
167164
; CHECK: vector.body:
@@ -174,10 +171,10 @@ define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
174171
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
175172
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
176173
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP16]]
177-
; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP11]], align 8
178-
; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP13]], align 8
179-
; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP15]], align 8
180-
; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP17]], align 8
174+
; CHECK-NEXT: store ptr [[TMP83]], ptr [[TMP11]], align 8
175+
; CHECK-NEXT: store ptr [[TMP83]], ptr [[TMP13]], align 8
176+
; CHECK-NEXT: store ptr [[TMP83]], ptr [[TMP15]], align 8
177+
; CHECK-NEXT: store ptr [[TMP83]], ptr [[TMP17]], align 8
181178
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
182179
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
183180
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]

0 commit comments

Comments
 (0)