Commit a9b2998

[VPlan] Skip cost assert if VPlan converted to single-scalar recipes.
Check if a VPlan transform converted recipes to single-scalar VPReplicateRecipes (after 07c085a). If that's the case, the legacy cost model incorrectly overestimates the cost. Fixes #141237.
1 parent af2a957 commit a9b2998
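
In essence, the change adds a bail-out to the helper guarding the "VPlan cost model and legacy cost model disagreed" assertion. Below is a condensed sketch of the new check, with names and logic taken from the diffs that follow; the enclosing iteration over the plan's recipes is paraphrased (`Recipes` here stands in for the plan's recipe range and is not a name from the patch):

  // Inside planContainsAdditionalSimplifications(): report an "additional
  // simplification" when a replicate recipe was narrowed to a single scalar
  // that the legacy model would still cost per vector lane.
  for (VPRecipeBase &R : Recipes) { // paraphrased walk over the plan
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
      if (RepR->isSingleScalar() &&
          !CostCtx.isLegacyUniformAfterVectorization(
              RepR->getUnderlyingInstr(), VF))
        return true; // legacy cost is an overestimate; skip the assert
    }
  }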

File tree: 3 files changed, +140 −36 lines
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 21 additions & 4 deletions

@@ -7082,6 +7082,11 @@ InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
   return CM.getInstructionCost(UI, VF);
 }
 
+bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
+                                                      ElementCount VF) const {
+  return CM.isUniformAfterVectorization(I, VF);
+}
+
 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
   return CM.ValuesToIgnore.contains(UI) ||
          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
@@ -7315,7 +7320,8 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
 /// cost-model did not account for.
 static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                   VPCostContext &CostCtx,
-                                                  Loop *TheLoop) {
+                                                  Loop *TheLoop,
+                                                  ElementCount VF) {
   // First collect all instructions for the recipes in Plan.
   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
@@ -7352,6 +7358,16 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     // comparing against the legacy cost isn't desirable.
     if (isa<VPPartialReductionRecipe>(&R))
       return true;
+
+    /// If a VPlan transform folded a recipe to one producing a single-scalar,
+    /// but the original instruction wasn't uniform-after-vectorization in the
+    /// legacy cost model, the legacy cost overestimates the actual cost.
+    if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+      if (RepR->isSingleScalar() &&
+          !CostCtx.isLegacyUniformAfterVectorization(
+              RepR->getUnderlyingInstr(), VF))
+        return true;
+    }
     if (Instruction *UI = GetInstructionForCost(&R)) {
       // If we adjusted the predicate of the recipe, the cost in the legacy
       // cost model may be different.
@@ -7477,9 +7493,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // legacy cost model doesn't properly model costs for such loops.
   assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
-                                                CostCtx, OrigLoop) ||
-          planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
-                                                CostCtx, OrigLoop)) &&
+                                                CostCtx, OrigLoop,
+                                                BestFactor.Width) ||
+          planContainsAdditionalSimplifications(
+              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
          " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be computed.");

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 5 additions & 0 deletions

@@ -364,6 +364,11 @@ struct VPCostContext {
 
   /// Returns the OperandInfo for \p V, if it is a live-in.
   TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
+
+  /// Return true if \p I is considered uniform-after-vectorization in the
+  /// legacy cost model for \p VF. Only used to check for additional VPlan
+  /// simplifications.
+  bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const;
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without
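
For context, the declaration above pairs with the one-line definition added in the LoopVectorize.cpp hunk earlier in this commit; it simply forwards the query to the legacy cost model:

  // Thin wrapper so VPlan-based code can query the legacy model's
  // uniform-after-vectorization analysis (definition from the diff above).
  bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
                                                        ElementCount VF) const {
    return CM.isUniformAfterVectorization(I, VF);
  }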
Lines changed: 114 additions & 32 deletions

@@ -1,4 +1,10 @@
-; RUN: opt -passes=loop-vectorize -S -mcpu=core-avx2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "middle.block:" --version 5
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@inc = global float 5.000000e-01, align 4
 
 ;float inc = 0.5;
 ;void foo(ptr A, unsigned N) {
@@ -8,40 +14,116 @@
 ; }
 ;}
 
-; CHECK-LABEL: foo
-; CHECK: vector.body
-; CHECK: load <8 x float>
-; CHECK: fadd <8 x float>
-; CHECK: store <8 x float>
+define void @foo(ptr nocapture noalias %A, i64 %N) #0 {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @inc, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 16
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 24
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 16
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 24
+; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP8]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP10]], ptr [[TMP13]], align 4
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+;
+entry:
+  br label %loop
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %l.inc = load float, ptr @inc, align 4
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  %l.A = load float, ptr %gep.A, align 4
+  %add = fadd float %l.inc, %l.A
+  store float %add, ptr %gep.A, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 32
+  br i1 %ec, label %exit, label %loop
 
-@inc = global float 5.000000e-01, align 4
+exit:
+  ret void
+}
 
-define void @foo(ptr nocapture %A, i32 %N) #0 {
+define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) {
+; CHECK-LABEL: define void @uniform_load_can_fold_users(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
+; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]]
+; CHECK-NEXT: store double [[TMP8]], ptr [[TMP13]], align 8
+; CHECK-NEXT: store double [[TMP8]], ptr [[TMP14]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+;
 entry:
-  %cmp3 = icmp eq i32 %N, 0
-  br i1 %cmp3, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %0 = load float, ptr @inc, align 4
-  %arrayidx = getelementptr inbounds float, ptr %A, i64 %indvars.iv
-  %1 = load float, ptr %arrayidx, align 4
-  %add = fadd float %0, %1
-  store float %add, ptr %arrayidx, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
+  br label %loop
+
+loop:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+  %iv.2 = phi i64 [ %start, %entry ], [ %iv.2.next, %loop ]
+  %l = load double, ptr %src, align 8
+  %m = fmul double %l, 9.0
+  %div = fdiv double %m, %d
+  %sub = sub i64 %iv.1, 1
+  %gep.1 = getelementptr double, ptr %dst, i64 %iv.1
+  %gep.2 = getelementptr double, ptr %gep.1, i64 %sub
+  store double %div, ptr %gep.2, align 8
+  %iv.1.next = add i64 %iv.1, 1
+  %iv.2.next = add i64 %iv.2, -1
+  %ec = icmp sgt i64 %iv.2, 0
+  br i1 %ec, label %loop, label %exit
+
+exit:
   ret void
 }
+
+attributes #0 = { "target-cpu"="core-avx2" }
