[LV] Account for vp_merge in out of loop EVL reductions in legacy cost model (#115903)

lukel97 · web-flow · commit 9e77f5900591 · 2024-11-14T16:55:18.000+09:00
In #101641, support for out of loop reductions with EVL tail folding was added by transforming selects to vp_merges in transformRecipestoEVLRecipes. Whilst the select was previously free, the vp_merge wasn't and incurs a cost on RISC-V with the VPlan cost model. But this diverged from the legacy cost model and caused the "VPlan cost model and legacy cost model disagreed" assertion to trigger when building 502.gcc_r from SPEC CPU 2017. Neither the select nor vp_merge recipes from the VPlan exist in the underlying instructions, so I thought it would make the most sense to fix this by adding the cost to the underlying phi instruction in getInstructionCost. It's worth noting that on RISC-V this vp_merge won't actually generate any instructions because the mask is all true, and will be folded away. So we should update the cost model at some point to reflect that.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6566,6 +6566,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
     }
 
+    // When tail folding with EVL, if the phi is part of an out of loop
+    // reduction then it will be transformed into a wide vp_merge.
+    if (VF.isVector() && foldTailWithEVL() &&
+        Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
+      IntrinsicCostAttributes ICA(
+          Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF),
+          {ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
+      return TTI.getIntrinsicInstrCost(ICA, CostKind);
+    }
+
     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
   }
   case Instruction::UDiv:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll
@@ -0,0 +1,25 @@
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s
+
+; CHECK: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC vp<%{{.+}}> = call llvm.vp.merge(ir<true>, ir<%add>, ir<%rdx>, vp<%{{.+}}>)
+; CHECK: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %rdx = phi i32 [ %start, %entry ], [ %add, %loop ]
+
+define i32 @add(ptr %a, i64 %n, i32 %start) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ %start, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %0, %rdx
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret i32 %add
+}