[LV] Use ICMP_UGE for BranchOnCount when VF is scalable #102575
Conversation
Posting it here to see if it is the right way to fix this issue in LV, because I can't fix it in SCEV.
Force-pushed from ea3c380 to bdc69c8.
@llvm/pr-subscribers-llvm-transforms

Author: Pengcheng Wang (wangpc-pp)

Changes

So that SCEV can analyse the bound of the loop count. This can fix the issue found in #100564.

Patch is 844.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102575.diff

70 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a93f275a39f5f..f40eebcf33bb8c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -524,7 +524,10 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
// First create the compare.
Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
- Value *Cond = Builder.CreateICmpEQ(IV, TC);
+ // Use ICMP_UGE so that SCEV can analyse the bound of loop count for
+ // scalable VF.
+ Value *Cond = Builder.CreateICmp(
+ State.VF.isScalable() ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_EQ, IV, TC);
// Now create the branch.
auto *Plan = getParent()->getPlan();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 78452a9c884eed..d18add49b38a8c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -709,7 +709,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; DEFAULT-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
; DEFAULT-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]]), !alias.scope [[META20:![0-9]+]], !noalias [[META21:![0-9]+]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP18:%.*]] = icmp uge i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT: middle.block:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index bce2d6c14d8668..cfcfed7394f6ca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -60,7 +60,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP36]], align 8
; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP39]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP40:%.*]] = icmp uge i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index 8c50d86489c9dd..f0e6ab86b089ea 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -28,7 +28,7 @@ define void @f1(ptr %A) #0 {
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; CHECK-NEXT: store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp uge i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
index 763b3e0bc82930..b48e82693e1d70 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -38,6 +38,32 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP16:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP14]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: ret double [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES_07:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ADD]] = fadd double [[RES_07]], [[TMP18]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
; SVE-LABEL: @test(
; SVE-NEXT: entry:
@@ -54,23 +80,49 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
; SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; SVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; SVE: vector.body:
; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[TMP4]]
-; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 4
-; SVE-NEXT: [[TMP7:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DATA:%.*]], <vscale x 2 x i64> [[TMP7]]
-; SVE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> poison)
-; SVE-NEXT: [[TMP9]] = fadd <vscale x 2 x double> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[TMP6]]
+; SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP8]], align 4
+; SVE-NEXT: [[TMP9:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DATA:%.*]], <vscale x 2 x i64> [[TMP9]]
+; SVE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP10]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> poison)
+; SVE-NEXT: [[TMP11]] = fadd <vscale x 2 x double> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SVE-NEXT: [[TMP12:%.*]] = icmp uge i64 [[INDEX_NEXT]], [[N_VEC]]
; SVE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SVE: middle.block:
+; SVE-NEXT: [[TMP13:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> [[TMP11]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; SVE: scalar.ph:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
+; SVE-NEXT: br label [[FOR_BODY:%.*]]
+; SVE: for.cond.cleanup.loopexit:
+; SVE-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; SVE-NEXT: br label [[FOR_COND_CLEANUP]]
+; SVE: for.cond.cleanup:
+; SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; SVE-NEXT: ret double [[RES_0_LCSSA]]
+; SVE: for.body:
+; SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; SVE-NEXT: [[RES_07:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; SVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[INDVARS_IV]]
+; SVE-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; SVE-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP14]] to i64
+; SVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[IDXPROM1]]
+; SVE-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; SVE-NEXT: [[ADD]] = fadd double [[RES_07]], [[TMP15]]
+; SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
%cmp6 = icmp sgt i32 %size, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index edba5ee1d7f9eb..f26d533e57e453 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -76,7 +76,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP36]], ptr [[TMP40]], align 1
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; DEFAULT-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP44:%.*]] = icmp uge i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DEFAULT: middle.block:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -115,7 +115,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[TMP61]], i32 0
; DEFAULT-NEXT: store <vscale x 4 x i8> [[TMP60]], ptr [[TMP62]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], [[TMP50]]
-; DEFAULT-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC6]]
+; DEFAULT-NEXT: [[TMP63:%.*]] = icmp uge i64 [[INDEX_NEXT12]], [[N_VEC6]]
; DEFAULT-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; DEFAULT: vec.epilog.middle.block:
; DEFAULT-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 40ea7056ff6e48..433daedc5ce7a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -31,7 +31,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TFNONE-NEXT: [[TMP9:%.*]] = icmp uge i64 [[INDEX_NEXT]], [[N_VEC]]
; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFNONE: middle.block:
; TFNONE-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -55,27 +55,27 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: entry:
; TFCOMMON-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; TFCOMMON-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; TFCOMMON-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
+; TFCOMMON-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
; TFCOMMON: vector.body:
; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFCOMMON-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
-; TFCOMMON-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TFCOMMON-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x i1> [[TMP10]], i32 0
-; TFCOMMON-NEXT: br i1 [[TMP11]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFCOMMON-NEXT: [[TMP8:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFCOMMON-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[TMP8]], i32 0
+; TFCOMMON-NEXT: br i1 [[TMP9]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFCOMMON: for.cond.cleanup:
; TFCOMMON-NEXT: ret void
;
@@ -83,44 +83,44 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
; TFA_INTERLEAVE-NEXT: entry:
; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
+; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_I...
[truncated]
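To make the semantics of the change concrete, here is a minimal hand-written sketch of the loop skeleton LV emits (my own illustration, not taken from the patch or its tests; names are made up). Because %n.vec is rounded down to a multiple of the step, and LV guards the vector loop with a minimum-iterations check, the nuw induction lands on %n.vec exactly, so the uge exit fires on the same iteration the old eq exit did:

; Sketch only: assumes the usual LV min-iterations guard, i.e. %n.vec >= %step
; on entry, so {0,+,%step} reaches %n.vec exactly and uge <=> eq at the latch.
define void @sketch(i64 %n) {
entry:
  %vscale = call i64 @llvm.vscale.i64()
  %step = shl i64 %vscale, 2                 ; VF = vscale x 4
  %n.mod.vf = urem i64 %n, %step             ; remainder goes to the scalar epilogue
  %n.vec = sub i64 %n, %n.mod.vf             ; a multiple of %step by construction
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; (vector work on lanes [%index, %index + %step) would go here)
  %index.next = add nuw i64 %index, %step    ; advances in exact multiples of %step
  %cond = icmp uge i64 %index.next, %n.vec   ; was: icmp eq
  br i1 %cond, label %middle.block, label %vector.body

middle.block:
  ret void
}

declare i64 @llvm.vscale.i64()

Note that only the latch compare changes; the middle.block comparisons in the tests (icmp eq of the original trip count against %n.vec) are untouched.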
Ping. Any comments?
@@ -38,6 +38,32 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
Did this code exist before and just wasn't being checked?
Double-checked and it existed before.
Why can't this be fixed in SCEV?
Could you elaborate on why this can't be fixed in SCEV?
Yes. The problem here is that we found we can't do LoopTermFold for some loops when doing scalable vectorization: https://godbolt.org/z/5fshc7Enq. After some investigation, the root cause is a check in LoopTermFold that returns false, and the reason it returns false is that SCEV can't determine the right bound of the loop count. IIRC, that's because we can't compute the right exit limit for the ICMP_NE and ICMP_EQ cases in ScalarEvolution::computeExitLimitFromICmp when the exit condition is a comparison of a scalable value (llvm/lib/Analysis/ScalarEvolution.cpp, lines 9182 to 9215 at 985d64b).
Before this patch:

vector.body:                                      ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%9 = getelementptr inbounds float, ptr %b, i64 %index
%wide.load = load <vscale x 4 x float>, ptr %9, align 4, !tbaa !9
%10 = fadd <vscale x 4 x float> %wide.load, shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
%11 = getelementptr inbounds float, ptr %a, i64 %index
store <vscale x 4 x float> %10, ptr %11, align 4, !tbaa !9
%index.next = add nuw i64 %index, %8
%12 = icmp eq i64 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body, !llvm.loop !13
After this patch:

vector.body:                                      ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%9 = getelementptr inbounds float, ptr %b, i64 %index
%wide.load = load <vscale x 4 x float>, ptr %9, align 4, !tbaa !9
%10 = fadd <vscale x 4 x float> %wide.load, shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
%11 = getelementptr inbounds float, ptr %a, i64 %index
store <vscale x 4 x float> %10, ptr %11, align 4, !tbaa !9
%index.next = add nuw i64 %index, %8
%.not = icmp ult i64 %index.next, %n.vec
br i1 %.not, label %vector.body, label %middle.block, !llvm.loop !13
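One way to observe the difference (my own repro sketch, not from the thread): feed a reduced form of each loop to opt's scalar-evolution printer and compare the backedge-taken counts. Everything below is hand-written and illustrative:

; Run with: opt -passes='print<scalar-evolution>' -disable-output repro.ll
; With the ult latch below, SCEV can bound the trip count even though the
; stride is a symbolic vscale-based value; switching the latch back to
; "icmp eq i64 %index.next, %n.vec" makes the backedge-taken count
; unpredictable again, which is what blocked LoopTermFold.
define void @repro(ptr %a, ptr %b, i64 %n.vec) {
entry:
  %vscale = call i64 @llvm.vscale.i64()
  %step = shl i64 %vscale, 2
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %src = getelementptr inbounds float, ptr %b, i64 %index
  %wide.load = load <vscale x 4 x float>, ptr %src, align 4
  %dst = getelementptr inbounds float, ptr %a, i64 %index
  store <vscale x 4 x float> %wide.load, ptr %dst, align 4
  %index.next = add nuw i64 %index, %step
  %cont = icmp ult i64 %index.next, %n.vec
  br i1 %cont, label %vector.body, label %middle.block

middle.block:
  ret void
}

declare i64 @llvm.vscale.i64()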
So that SCEV can analyse the bound of the loop count. This can fix the issue found in llvm#100564.

Force-pushed from bdc69c8 to 587674a.
Gentle ping. @fhahn

Gentle ping.

I have expressed this elsewhere, but I want to note on the review that I don't believe this to be the right approach. Teaching SCEV to better understand scalable VFs is clearly the right answer long term. It has some hard parts, but working on that, as opposed to working around it like this, is clearly the right long-term approach.