[VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Element (NFC). #137030

fhahn · 2025-04-23T18:01:12Z

ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument.

ExtractFromEnd only has 2 uses, extracting the last and penultimate lanes. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument.

llvmbot · 2025-04-23T18:01:51Z

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Florian Hahn (fhahn)

Changes

ExtractFromEnd only has 2 uses, extracting the last and penultimate lanes. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument.

Patch is 20.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137030.diff

13 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+2-6)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+4-4)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2-1)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+13-14)
(modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+2-2)
(modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+8-3)
(modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll (+2-2)
(modified) llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll (+4-4)
(modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll (+9-9)
(modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll (+1-1)
(modified) llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll (+1-1)
(modified) llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll (+3-3)
(modified) llvm/test/Transforms/LoopVectorize/vplan-printing.ll (+3-3)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2f7c36cecd9d..0ee4216933cee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9326,8 +9326,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
       cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
   VPBuilder ScalarPHBuilder(ScalarPH);
-  VPValue *OneVPV = Plan.getOrAddLiveIn(
-      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
   for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
     auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
 
@@ -9362,7 +9360,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
            "Cannot handle loops with uncountable early exits");
     if (IsFOR)
       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
-          VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
+          VPInstruction::ExtractLastLane, {ResumeFromVectorLoop}, {},
           "vector.recur.extract");
     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
@@ -9440,8 +9438,6 @@ static void addExitUsersForFirstOrderRecurrences(
   auto *MiddleVPBB = Plan.getMiddleBlock();
   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-  VPValue *TwoVPV = Plan.getOrAddLiveIn(
-      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
 
   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
@@ -9525,7 +9521,7 @@ static void addExitUsersForFirstOrderRecurrences(
       if (ExitIRI->getOperand(0) != FOR)
         continue;
       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
-          VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
+          VPInstruction::ExtractPenultimateLane, {FOR->getBackedgeValue()}, {},
           "vector.recur.extract.for.phi");
       ExitIRI->setOperand(0, PenultimateElement);
       ExitUsersToFix.remove(ExitIRI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0f2aac146e7a6..94382a2d7af57 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -878,10 +878,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
     Broadcast,
     ComputeFindLastIVResult,
     ComputeReductionResult,
-    // Takes the VPValue to extract from as first operand and the lane or part
-    // to extract as second operand, counting from the end starting with 1 for
-    // last. The second operand must be a positive constant and <= VF.
-    ExtractFromEnd,
+    // Extracts the last lane from its operand.
+    ExtractLastLane,
+    // Extracts the second-to-last lane from its operand.
+    ExtractPenultimateLane,
     LogicalAnd, // Non-poison propagating logical And.
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 375d4c9787994..946f1db4a8885 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -88,7 +88,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return SetResultTyFromOp();
   case VPInstruction::FirstActiveLane:
     return Type::getIntNTy(Ctx, 64);
-  case VPInstruction::ExtractFromEnd: {
+  case VPInstruction::ExtractLastLane:
+  case VPInstruction::ExtractPenultimateLane: {
     Type *BaseTy = inferScalarType(R->getOperand(0));
     if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
       return VecTy->getElementType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a342d5182c974..d6fbfb56b967a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -725,10 +725,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
     return ReducedPartRdx;
   }
-  case VPInstruction::ExtractFromEnd: {
-    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
-    unsigned Offset = CI->getZExtValue();
-    assert(Offset > 0 && "Offset from end must be positive");
+  case VPInstruction::ExtractLastLane:
+  case VPInstruction::ExtractPenultimateLane: {
+    unsigned Offset = getOpcode() == VPInstruction::ExtractLastLane ? 1 : 2;
     Value *Res;
     if (State.VF.isVector()) {
       assert(Offset <= State.VF.getKnownMinValue() &&
@@ -854,7 +853,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
 }
 
 bool VPInstruction::isVectorToScalar() const {
-  return getOpcode() == VPInstruction::ExtractFromEnd ||
+  return getOpcode() == VPInstruction::ExtractLastLane ||
+         getOpcode() == VPInstruction::ExtractPenultimateLane ||
          getOpcode() == Instruction::ExtractElement ||
          getOpcode() == VPInstruction::FirstActiveLane ||
          getOpcode() == VPInstruction::ComputeFindLastIVResult ||
@@ -924,7 +924,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::AnyOf:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
-  case VPInstruction::ExtractFromEnd:
+  case VPInstruction::ExtractLastLane:
+  case VPInstruction::ExtractPenultimateLane:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
@@ -1042,8 +1043,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::Broadcast:
     O << "broadcast";
     break;
-  case VPInstruction::ExtractFromEnd:
-    O << "extract-from-end";
+  case VPInstruction::ExtractLastLane:
+    O << "extract-last-lane";
+    break;
+  case VPInstruction::ExtractPenultimateLane:
+    O << "extract-penultimate-lane";
     break;
   case VPInstruction::ComputeFindLastIVResult:
     O << "compute-find-last-iv-result";
@@ -1143,12 +1147,7 @@ void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
   assert(getNumOperands() == 1 && "must have a single operand");
   VPValue *Exiting = getOperand(0);
   if (!Exiting->isLiveIn()) {
-    LLVMContext &Ctx = getInstruction().getContext();
-    auto &Plan = *getParent()->getPlan();
-    Exiting = Builder.createNaryOp(
-        VPInstruction::ExtractFromEnd,
-        {Exiting,
-         Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::get(Ctx, 32), 1))});
+    Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, {Exiting});
   }
   setOperand(0, Exiting);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f2dc68b2ea8b6..f5e4a466f4b37 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -827,8 +827,8 @@ optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
   using namespace VPlanPatternMatch;
 
   VPValue *Incoming;
-  if (!match(Op, m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                     m_VPValue(Incoming), m_SpecificInt(1))))
+  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastLane>(
+                     m_VPValue(Incoming))))
     return nullptr;
 
   auto *WideIV = getOptimizableIVOf(Incoming);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index b48a447834cc8..e404b1d09b452 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -337,14 +337,19 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
       continue;
     }
     VPValue *Op0;
-    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                      m_VPValue(Op0), m_VPValue(Op1)))) {
+    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>(
+                      m_VPValue(Op0))) ||
+        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>(
+                      m_VPValue(Op0)))) {
       addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
       if (Plan.hasScalarVFOnly()) {
         // Extracting from end with VF = 1 implies retrieving the scalar part UF
         // - Op1.
         unsigned Offset =
-            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+            match(&R,
+                  m_VPInstruction<VPInstruction::ExtractLastLane>(m_VPValue()))
+                ? 1
+                : 2;
         R.getVPSingleValue()->replaceAllUsesWith(
             getValueForPart(Op0, UF - Offset));
         R.eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 24badfb7b1042..ef72d56d082d8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -42,7 +42,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
-; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-last-lane vp<[[RED_RESULT]]>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -107,7 +107,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
-; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-last-lane vp<[[RED_RESULT]]>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index bfaa09241e30b..4fcb2c9fe96c3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -62,7 +62,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: middle.block:
 ; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
-; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>:
@@ -102,7 +102,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: middle.block:
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
-; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
@@ -137,7 +137,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: middle.block:
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
-; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; NO-VP-OUTLOOP-NEXT:   EMIT branch-on-cond vp<[[BOC]]>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -185,7 +185,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: middle.block:
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
-; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-last-lane vp<[[RDX]]>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; NO-VP-INLOOP-NEXT:   EMIT branch-on-cond vp<[[BOC]]>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 8624d9201a7c8..0d47163e109cc 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -37,8 +37,8 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1>
-; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1>
+; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane ir<%for.1.next>
+; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = extract-last-lane vp<[[FOR1_SPLICE]]>
 ; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]>
 ; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
@@ -117,9 +117,9 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1>
-; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1>
-; CHECK-NEXT:    EMIT vp<[[RESUME_3:%.+]]>.2 = extract-from-end vp<[[FOR2_SPLICE]]>, ir<1>
+; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane ir<%for.1.next>
+; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = extract-last-lane vp<[[FOR1_SPLICE]]>
+; CHECK-NEXT:    EMIT vp<[[RESUME_3:%.+]]>.2 = extract-last-lane vp<[[FOR2_SPLICE]]>
 ; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]>
 ; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
@@ -203,8 +203,8 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
-; CHECK-NEXT:   EMIT vp<[[EXT_Y:%.+]]>.1 = extract-from-end ir<%for.x.prev>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXT_X:%.+]]> = extract-last-lane ir<%for.x.next>
+; CHECK-NEXT:   EMIT vp<[[EXT_Y:%.+]]>.1 = extract-last-lane ir<%for.x.prev>
 ; CHECK-NEXT:   EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_C]]>
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
@@ -282,8 +282,8 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
-; CHECK-NEXT:   EMIT vp<[[EXT_Y:%.+]]>.1 = extract-from-end ir<%for.x.prev>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXT_X:%.+]]> = extract-last-lane ir<%for.x.next>
+; CHECK-NEXT:   EMIT vp<[[EXT_Y:%.+]]>.1 = extract-last-lane ir<%for.x.prev>
 ; CHECK-NEXT:   EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_C]]>
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 794ff99a68672..0805cf3bcb01b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -214,7 +214,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:  EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
-; CHECK-NEXT:  EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
+; CHECK-NEXT:  EMIT vp<[[RED_EX:%.+]]> = extract-last-lane vp<[[RED_RES]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 671ed2af52621..dd400737bef01 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -220,7 +220,7 @@ exit:
 ; DBG-NEXT: Successor(s): middle.block
 ; DBG-EMPTY:
 ; DBG-NEXT: middle.block:
-; DBG-NEXT:   EMIT vp<[[RESUME_1:%.+]]> = extract-from-end vp<[[SCALAR_STEPS]]>, ir<1>
+; DBG-NEXT:   EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane vp<[[SCALAR_STEPS]]>
 ; DBG-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
 ; DBG-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index d0d276ab967fa..0709372890dac 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -35,7 +35,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
-; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-last-lane vp<[[RED_RES]]>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -176,7 +176,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]>
-; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-last-lane vp<[[RED_RES]]>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -238,7 +238,7 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<%cond>
-; CHECK-NEXT:   EMIT vp<[[EXT:%.+]]> = extract-from-end vp<[[RDX_RES]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXT:%.+]]> = extract-last-lane vp<[[RDX_RES]]>
 ; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index fc80d348042a0..b4e97a401c58e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -538,7 +538,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[EXIT:%.+]]> = extr...
[truncated]

ayalz

Specializing to two distinct opcodes with suitable names LGTM!
ExtractFromEnd works for all vectors, naturally limiting its offset to at most 2.
The usages of last vs. penultimate are conceptually distinct, naturally limiting its offset to be an immediate/immutable value.

Another option is to encode the immediate offset using an IRFlags flag (1 bit suffices), given that both extracts need no other flag. Would that be better?

ayalz · 2025-04-23T21:56:02Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

@@ -1143,12 +1147,7 @@ void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
  assert(getNumOperands() == 1 && "must have a single operand");
  VPValue *Exiting = getOperand(0);
  if (!Exiting->isLiveIn()) {


Suggested change

if (!Exiting->isLiveIn()) {

if (Exiting->isLiveIn())

return;

done thanks

ayalz · 2025-04-23T22:03:49Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

        // Extracting from end with VF = 1 implies retrieving the scalar part UF
        // - Op1.


"Op1" is gone:

Suggested change

// Extracting from end with VF = 1 implies retrieving the scalar part UF

// - Op1.

// Extracting from end with VF = 1 implies retrieving the last or penultimate scalar part (UF-1 or UF-2).

In this case we're extracting a last/penultimate "Part" rather than "Lane". Perhaps better call the opcodes ExtractLast/ExtractPenultimate, or ExtractLastElement/ExtractPenultimateElement.

Updated, thanks

ayalz · 2025-04-23T22:12:49Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

+    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>(
+                      m_VPValue(Op0))) ||
+        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>(
+                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {


Handle simpler vector VF case first?

Can do separately, leaving as-is for now to not pull in unrelated changes.

ayalz · 2025-04-23T22:36:37Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

+    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>(
+                      m_VPValue(Op0))) ||
+        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>(
+                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part UF
        // - Op1.
        unsigned Offset =


Trying to simplify:

Suggested change

unsigned Offset =

auto *I = dyn_cast<VPInstruction>(&R); // Place above for reuse. Similar to H below.

unsigned Offset = I->getOpcode() == VPInstruction::ExtractLast ? 1 : 2;

consistent with how VPInstruction::generate() computes Offset?

done, using cast, thanks

ayalz · 2025-04-23T22:41:19Z

llvm/lib/Transforms/Vectorize/VPlan.h

+    // Extracts the last lane from its operand.
+    ExtractLastLane,
+    // Extracts the second-to-last lane from its operand.
+    ExtractPenultimateLane,


Extract last applies to both lane (when VF>1) and part (when VF=1), so better call it ExtractLast?
Extract penultimate applies to lane only (demands VF>1), so ExtractPenultimateLane is accurate.

Updated, thanks

…-from-end

fhahn

Specializing to two distinct opcodes with suitable names LGTM! ExtractFromEnd works for all vectors, naturally limiting its offset to at most 2. The usages of last vs. penultimate are conceptually distinct, naturally limiting its offset to be an immediate/immutable value.

Another option is to encode the immediate offset using an IRFlags flag (1 bit suffices), given that both extracts need no other flag. Would that be better?

I had a look at other potential users of such immediate offsets, but it seems like there aren't any other takers at the moment, so it might be better to go with the different opcodes for now?

fhahn · 2025-04-24T20:29:43Z

llvm/lib/Transforms/Vectorize/VPlan.h

+    // Extracts the last lane from its operand.
+    ExtractLastLane,
+    // Extracts the second-to-last lane from its operand.
+    ExtractPenultimateLane,


Updated, thanks

fhahn · 2025-04-24T20:29:50Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

@@ -1143,12 +1147,7 @@ void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
  assert(getNumOperands() == 1 && "must have a single operand");
  VPValue *Exiting = getOperand(0);
  if (!Exiting->isLiveIn()) {


done thanks

fhahn · 2025-04-24T20:30:29Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

+    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>(
+                      m_VPValue(Op0))) ||
+        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>(
+                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {


Can do separately, leaving as-is for now to not pull in unrelated changes.

fhahn · 2025-04-24T20:33:05Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

        // Extracting from end with VF = 1 implies retrieving the scalar part UF
        // - Op1.


Updated, thanks

fhahn · 2025-04-24T20:36:40Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

+    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastLane>(
+                      m_VPValue(Op0))) ||
+        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateLane>(
+                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part UF
        // - Op1.
        unsigned Offset =


done, using cast, thanks

ayalz · 2025-04-25T08:07:35Z

llvm/lib/Transforms/Vectorize/VPlan.h

+    // Extracts the second-to-last lane from its operand.
+    ExtractPenultimateLane,


This probably should be further clarified: extract penultimate is used also for part when VF=1, but gets replaced by hooking up to penultimate part during unrolling-by-UF (right?). So during code-gen extract penultimate expects VF>1, i.e., it's per lane rather than per part; but so does extract last? So perhaps better rename ExtractPenultimate too, explaining that both apply to lane or part, where the latter are abstract - dissolved prior to code-gen?
Could also have ExtractLastElement and ExtractPenultimateElement, leaving the element to be either lane or full part.

Thanks, updated in the latest version to use Element suffix for both, mention that for the scalar case it extracts the last/second-to-last part and that in this case it gets removed by unrolling.

…-from-end

ayalz

LGTM, thanks!

ayalz · 2025-04-25T10:53:08Z

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

-        // Extracting from end with VF = 1 implies retrieving the scalar part UF
-        // - Op1.
+        auto *I = cast<VPInstruction>(&R);
+        // Extracting from end with VF = 1 implies retrieving retrieving the


Suggested change

// Extracting from end with VF = 1 implies retrieving retrieving the

// Extracting from end with VF = 1 implies retrieving the

…-from-end

…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030

…te)Element (NFC). (#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm/llvm-project#137030

…(NFC). (llvm#137030) ExtractFromEnd only has 2 uses, extracting the last and penultimate elements. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument. PR: llvm#137030

[VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Lane (NFC).

876f977

ExtractFromEnd only has 2 uses, extracting the last and penultimate lanes. Replace it with 2 separate opcodes, removing the need to materialize and handle a constant argument.

fhahn requested review from Mel-Chen, ayalz and aniragil April 23, 2025 18:01

llvmbot added vectorizers llvm:transforms labels Apr 23, 2025

ayalz reviewed Apr 23, 2025

View reviewed changes

fhahn added 2 commits April 24, 2025 21:26

Merge remote-tracking branch 'origin/main' into vplan-replace-extract…

80d1d0e

…-from-end

!fixup address latest comments, thanks

65178dd

fhahn commented Apr 24, 2025

View reviewed changes

ayalz reviewed Apr 25, 2025

View reviewed changes

fhahn added 2 commits April 25, 2025 10:28

Merge remote-tracking branch 'origin/main' into vplan-replace-extract…

20740c7

…-from-end

!fixup Lane->Element

e5ecd86

ayalz approved these changes Apr 25, 2025

View reviewed changes

fhahn changed the title ~~[VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Lane (NFC).~~ [VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Element (NFC). Apr 25, 2025

fhahn added 2 commits April 25, 2025 15:13

Merge remote-tracking branch 'origin/main' into vplan-replace-extract…

1a5a728

…-from-end

!fixup remove retrieving

733f4b5

fhahn merged commit df21288 into llvm:main Apr 25, 2025
11 checks passed

fhahn deleted the vplan-replace-extract-from-end branch April 25, 2025 17:59

ayalz mentioned this pull request Jun 5, 2025

[LoopVectorize] Vectorize fixed-order recurrence with vscale x 1. #142772

Open

		// Extracting from end with VF = 1 implies retrieving the scalar part UF
		// - Op1.

	unsigned Offset =
	auto *I = dyn_cast<VPInstruction>(&R); // Place above for reuse. Similar to H below.
	unsigned Offset = I->getOpcode() == VPInstruction::ExtractLast ? 1 : 2;

		// Extracts the second-to-last lane from its operand.
		ExtractPenultimateLane,

[VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Element (NFC). #137030

[VPlan] Replace ExtractFromEnd with Extract(Last|Penultimate)Element (NFC). #137030

Uh oh!

Conversation

fhahn commented Apr 23, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Apr 23, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ayalz left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

fhahn left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

ayalz Apr 25, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

ayalz left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

fhahn commented Apr 23, 2025 •

edited

Loading

llvmbot commented Apr 23, 2025 •

edited

Loading

ayalz Apr 25, 2025 •

edited

Loading