-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Element (NFC). #137030
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
ExtractFromEnd only has 2 uses, extracting the last and penultimate lanes. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) ChangesExtractFromEnd only has 2 uses, extracting the last and penultimate lanes. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. Patch is 20.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137030.diff 13 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2f7c36cecd9d..0ee4216933cee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9326,8 +9326,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
VPBuilder ScalarPHBuilder(ScalarPH);
- VPValue *OneVPV = Plan.getOrAddLiveIn(
- ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
@@ -9362,7 +9360,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
"Cannot handle loops with uncountable early exits");
if (IsFOR)
ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
+ VPInstruction::ExtractLastLane, {ResumeFromVectorLoop}, {},
"vector.recur.extract");
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
@@ -9440,8 +9438,6 @@ static void addExitUsersForFirstOrderRecurrences(
auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder ScalarPHBuilder(ScalarPHVPBB);
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
- VPValue *TwoVPV = Plan.getOrAddLiveIn(
- ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
@@ -9525,7 +9521,7 @@ static void addExitUsersForFirstOrderRecurrences(
if (ExitIRI->getOperand(0) != FOR)
continue;
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
+ VPInstruction::ExtractPenultimateLane, {FOR->getBackedgeValue()}, {},
"vector.recur.extract.for.phi");
ExitIRI->setOperand(0, PenultimateElement);
ExitUsersToFix.remove(ExitIRI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0f2aac146e7a6..94382a2d7af57 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -878,10 +878,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
Broadcast,
ComputeFindLastIVResult,
ComputeReductionResult,
- // Takes the VPValue to extract from as first operand and the lane or part
- // to extract as second operand, counting from the end starting with 1 for
- // last. The second operand must be a positive constant and <= VF.
- ExtractFromEnd,
+ // Extracts the last lane from its operand.
+ ExtractLastLane,
+ // Extracts the second-to-last lane from its operand.
+ ExtractPenultimateLane,
LogicalAnd, // Non-poison propagating logical And.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 375d4c9787994..946f1db4a8885 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -88,7 +88,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return SetResultTyFromOp();
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
- case VPInstruction::ExtractFromEnd: {
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractPenultimateLane: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a342d5182c974..d6fbfb56b967a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -725,10 +725,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return ReducedPartRdx;
}
- case VPInstruction::ExtractFromEnd: {
- auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
- unsigned Offset = CI->getZExtValue();
- assert(Offset > 0 && "Offset from end must be positive");
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractPenultimateLane: {
+ unsigned Offset = getOpcode() == VPInstruction::ExtractLastLane ? 1 : 2;
Value *Res;
if (State.VF.isVector()) {
assert(Offset <= State.VF.getKnownMinValue() &&
@@ -854,7 +853,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
bool VPInstruction::isVectorToScalar() const {
- return getOpcode() == VPInstruction::ExtractFromEnd ||
+ return getOpcode() == VPInstruction::ExtractLastLane ||
+ getOpcode() == VPInstruction::ExtractPenultimateLane ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeFindLastIVResult ||
@@ -924,7 +924,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::AnyOf:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
- case VPInstruction::ExtractFromEnd:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractPenultimateLane:
case VPInstruction::FirstActiveLane:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
@@ -1042,8 +1043,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Broadcast:
O << "broadcast";
break;
- case VPInstruction::ExtractFromEnd:
- O << "extract-from-end";
+ case VPInstruction::ExtractLastLane:
+ O << "extract-last-lane";
+ break;
+ case VPInstruction::ExtractPenultimateLane:
+ O << "extract-penultimate-lane";
break;
case VPInstruction::ComputeFindLastIVResult:
O << "compute-find-last-iv-result";
@@ -1143,12 +1147,7 @@ void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
assert(getNumOperands() == 1 && "must have a single operand");
VPValue *Exiting = getOperand(0);
if (!Exiting->isLiveIn()) {
- LLVMContext &Ctx = getInstruction().getContext();
- auto &Plan = *getParent()->getPlan();
- Exiting = Builder.createNaryOp(
- VPInstruction::ExtractFromEnd,
- {Exiting,
- Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::get(Ctx, 32), 1))});
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, {Exiting});
}
setOperand(0, Exiting);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f2dc68b2ea8b6..f5e4a466f4b37 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -827,8 +827,8 @@ optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
using namespace VPlanPatternMatch;
VPValue *Incoming;
- if (!match(Op, m_VPInstruction<VPInstruction::ExtractFromEnd>(
- m_VPValue(Incoming), m_SpecificInt(1))))
+ if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastLane>(
+ m_VPValue(Incoming))))
return nullptr;
auto *WideIV = getOptimizableIVOf(Incoming);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index b48a447834cc8..e404b1d09b452 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -337,14 +337,19 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
VPValue *Op0;
- if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
- m_VPValue(Op0), m_VPValue(Op1)))) {
+ if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>(
+ m_VPValue(Op0))) ||
+ match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>(
+ m_VPValue(Op0)))) {
addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
if (Plan.hasScalarVFOnly()) {
// Extracting from end with VF = 1 implies retrieving the scalar part UF
// - Op1.
unsigned Offset =
- cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+ match(&R,
+ m_VPInstruction<VPInstruction::ExtractLastLane>(m_VPValue()))
+ ? 1
+ : 2;
R.getVPSingleValue()->replaceAllUsesWith(
getValueForPart(Op0, UF - Offset));
R.eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 24badfb7b1042..ef72d56d082d8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -42,7 +42,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
-; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-last-lane vp<[[RED_RESULT]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -107,7 +107,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-last-lane vp<[[RED_RESULT]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index bfaa09241e30b..4fcb2c9fe96c3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -62,7 +62,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: middle.block:
; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
-; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>:
@@ -102,7 +102,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: middle.block:
; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
-; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
@@ -137,7 +137,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: middle.block:
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
-; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -185,7 +185,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: middle.block:
; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
-; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 8624d9201a7c8..0d47163e109cc 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -37,8 +37,8 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1>
-; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane ir<%for.1.next>
+; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-last-lane vp<[[FOR1_SPLICE]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -117,9 +117,9 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1>
-; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1>
-; CHECK-NEXT: EMIT vp<[[RESUME_3:%.+]]>.2 = extract-from-end vp<[[FOR2_SPLICE]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane ir<%for.1.next>
+; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-last-lane vp<[[FOR1_SPLICE]]>
+; CHECK-NEXT: EMIT vp<[[RESUME_3:%.+]]>.2 = extract-last-lane vp<[[FOR2_SPLICE]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -203,8 +203,8 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
-; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]>.1 = extract-from-end ir<%for.x.prev>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-last-lane ir<%for.x.next>
+; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]>.1 = extract-last-lane ir<%for.x.prev>
; CHECK-NEXT: EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_C]]>
; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
@@ -282,8 +282,8 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
-; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]>.1 = extract-from-end ir<%for.x.prev>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-last-lane ir<%for.x.next>
+; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]>.1 = extract-last-lane ir<%for.x.prev>
; CHECK-NEXT: EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_C]]>
; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 794ff99a68672..0805cf3bcb01b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -214,7 +214,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
-; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-last-lane vp<[[RED_RES]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 671ed2af52621..dd400737bef01 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -220,7 +220,7 @@ exit:
; DBG-NEXT: Successor(s): middle.block
; DBG-EMPTY:
; DBG-NEXT: middle.block:
-; DBG-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end vp<[[SCALAR_STEPS]]>, ir<1>
+; DBG-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane vp<[[SCALAR_STEPS]]>
; DBG-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
; DBG-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index d0d276ab967fa..0709372890dac 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -35,7 +35,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-last-lane vp<[[RED_RES]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -176,7 +176,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]>
-; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-last-lane vp<[[RED_RES]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -238,7 +238,7 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<%cond>
-; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-from-end vp<[[RDX_RES]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-last-lane vp<[[RDX_RES]]>
; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}>
; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index fc80d348042a0..b4e97a401c58e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -538,7 +538,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1>
+; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extr...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Specializing to two distinct opcodes with suitable names LGTM!
ExtractFromEnd works for all vectors, naturally limiting its offset to at most 2.
The usages of last vs. penultimate are conceptually distinct, naturally limiting its offset to be an immediate/immutable value.
Another option is to encode the immediate offset using an IRFlags flag (1 bit suffices), given that both extracts need no other flag. Would that be better?
@@ -1143,12 +1147,7 @@ void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) { | |||
assert(getNumOperands() == 1 && "must have a single operand"); | |||
VPValue *Exiting = getOperand(0); | |||
if (!Exiting->isLiveIn()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (!Exiting->isLiveIn()) { | |
if (Exiting->isLiveIn()) | |
return; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
// Extracting from end with VF = 1 implies retrieving the scalar part UF | ||
// - Op1. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Op1" is gone:
// Extracting from end with VF = 1 implies retrieving the scalar part UF | |
// - Op1. | |
// Extracting from end with VF = 1 implies retrieving the last or penultimate scalar part (UF-1 or UF-2). |
In this case we're extracting a last/penultimate "Part" rather than "Lane". Perhaps better call the opcodes ExtractLast/ExtractPenultimate, or ExtractLastElement/ExtractPenultimateElement.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>( | ||
m_VPValue(Op0))) || | ||
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>( | ||
m_VPValue(Op0)))) { | ||
addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); | ||
if (Plan.hasScalarVFOnly()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Handle simpler vector VF case first?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can do separately, leaving as-is for now to not pull in unrelated changes.
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>( | ||
m_VPValue(Op0))) || | ||
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>( | ||
m_VPValue(Op0)))) { | ||
addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); | ||
if (Plan.hasScalarVFOnly()) { | ||
// Extracting from end with VF = 1 implies retrieving the scalar part UF | ||
// - Op1. | ||
unsigned Offset = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trying to simplify:
unsigned Offset = | |
auto *I = dyn_cast<VPInstruction>(&R); // Place above for reuse. Similar to H below. | |
unsigned Offset = I->getOpcode() == VPInstruction::ExtractLast ? 1 : 2; |
consistent with how VPInstruction::generate() computes Offset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, using cast, thanks
// Extracts the last lane from its operand. | ||
ExtractLastLane, | ||
// Extracts the second-to-last lane from its operand. | ||
ExtractPenultimateLane, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract last applies to both lane (when VF>1) and part (when VF=1), so better call it ExtractLast
?
Extract penultimate applies to lane only (demands VF>1), so ExtractPenultimateLane
is accurate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Specializing to two distinct opcodes with suitable names LGTM! ExtractFromEnd works for all vectors, naturally limiting its offset to at most 2. The usages of last vs. penultimate are conceptually distinct, naturally limiting its offset to be an immediate/immutable value.
Another option is to encode the immediate offset using an IRFlags flag (1 bit suffices), given that both extracts need no other flag. Would that be better?
I had a look at other potential users of such immediate offsets, but it seems like there aren't any other takers at the moment, so it might be better to go with the different opcodes for now?
// Extracts the last lane from its operand. | ||
ExtractLastLane, | ||
// Extracts the second-to-last lane from its operand. | ||
ExtractPenultimateLane, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
@@ -1143,12 +1147,7 @@ void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) { | |||
assert(getNumOperands() == 1 && "must have a single operand"); | |||
VPValue *Exiting = getOperand(0); | |||
if (!Exiting->isLiveIn()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>( | ||
m_VPValue(Op0))) || | ||
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>( | ||
m_VPValue(Op0)))) { | ||
addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); | ||
if (Plan.hasScalarVFOnly()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can do separately, leaving as-is for now to not pull in unrelated changes.
// Extracting from end with VF = 1 implies retrieving the scalar part UF | ||
// - Op1. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>( | ||
m_VPValue(Op0))) || | ||
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>( | ||
m_VPValue(Op0)))) { | ||
addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); | ||
if (Plan.hasScalarVFOnly()) { | ||
// Extracting from end with VF = 1 implies retrieving the scalar part UF | ||
// - Op1. | ||
unsigned Offset = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, using cast, thanks
// Extracts the second-to-last lane from its operand. | ||
ExtractPenultimateLane, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This probably should be further clarified: extract penultimate is used also for part when VF=1, but gets replaced by hooking up to penultimate part during unrolling-by-UF (right?). So during code-gen extract penultimate expects VF>1, i.e., it's per lane rather than per part; but so does extract last? So perhaps better rename ExtractPenultimate
too, explaining that both apply to lane or part, where the latter are abstract - dissolved prior to code-gen?
Could also have ExtractLastElement
and ExtractPenultimateElement
, leaving the element to be either lane or full part.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, updated in the latest version to use Element
suffix for both, mention that for the scalar case it extracts the last/second-to-last part and that in this case it gets removed by unrolling.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
// Extracting from end with VF = 1 implies retrieving the scalar part UF | ||
// - Op1. | ||
auto *I = cast<VPInstruction>(&R); | ||
// Extracting from end with VF = 1 implies retrieving retrieving the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Extracting from end with VF = 1 implies retrieving retrieving the | |
// Extracting from end with VF = 1 implies retrieving the |
…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030
…te)Element (NFC). (#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm/llvm-project#137030
…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030
…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030
…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030
…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030
ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument.