Skip to content

[LV][RFC] Generating conditional VPBB that will be skipped when the mask is inactive in VPlan. #141900

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1822,6 +1822,10 @@ class TargetTransformInfo {
/// otherwise scalar epilogue loop.
LLVM_ABI bool preferEpilogueVectorization() const;

/// Return true if the loop vectorizer shoud consider vectorizing with
/// flattern control flow, otherwise create conditional vector basic block.
bool preferFlattenControlFlow() const;

/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,8 @@ class TargetTransformInfoImplBase {

/// Default implementation: returns true.
virtual bool preferEpilogueVectorization() const {
  return true;
}

/// Default implementation: returns true.
virtual bool preferFlattenControlFlow() const {
  return true;
}

/// Default implementation: returns true for every \p II.
virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
  constexpr bool ExpandByDefault = true;
  return ExpandByDefault;
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::preferPredicateOverEpilogue(TFI);
}

bool preferFlattenControlFlow() const override {
return thisT()->preferFlattenControlFlow();
}

TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
return TTIImpl->preferPredicateOverEpilogue(TFI);
}

/// Forwards the query to the target implementation held in TTIImpl.
bool TargetTransformInfo::preferFlattenControlFlow() const {
  const bool PrefersFlattened = TTIImpl->preferFlattenControlFlow();
  return PrefersFlattened;
}

TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
bool IVUpdateMayOverflow) const {
return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return false;
}

bool preferFlattenControlFlow() const { return false; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should make the decision based on branch probability?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your comment!

Yeah, I think it would be better to disable the conditional VPBB by default and just expose the TTI hook here for future per-CPU tuning.


InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,11 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));

// Hidden command-line switch, enabled unless explicitly turned off.
static cl::opt<bool> PreferFlattenControlFlow(
    "prefer-flatten-control-flow", cl::Hidden, cl::init(true),
    cl::desc("Prefer flatten control flow."));

cl::opt<bool> llvm::EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
Expand Down Expand Up @@ -9287,6 +9292,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
}
VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);

if (!PreferFlattenControlFlow && !TTI.preferFlattenControlFlow())
VPlanTransforms::optimizeConditionalVPBB(*Plan);

assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
Expand Down
176 changes: 176 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2087,6 +2087,23 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}

/// If \p R has the shape
///   BranchOnCount(AnyOf(LogicalAnd(\p HeaderMask, EdgeMask)), <any>)
/// rewrite the AnyOf so that it takes EdgeMask directly, dropping the header
/// mask from the condition. Returns true iff the rewrite happened; a null
/// \p R is never rewritten.
static bool replaceHeaderMaskToEVL(VPValue *HeaderMask, VPRecipeBase *R) {
  using namespace llvm::VPlanPatternMatch;
  if (!R)
    return false;

  VPValue *EdgeMask;
  const bool IsHeaderMaskedBranch =
      match(R, m_Binary<VPInstruction::BranchOnCount>(
                   m_VPInstruction<VPInstruction::AnyOf>(
                       m_Binary<VPInstruction::LogicalAnd>(
                           m_Specific(HeaderMask), m_VPValue(EdgeMask))),
                   m_VPValue()));
  if (!IsHeaderMaskedBranch)
    return false;

  // Operand 0 of the branch is the AnyOf matched above.
  auto *AnyOfCond = cast<VPInstruction>(R->getOperand(0));
  AnyOfCond->setOperand(0, EdgeMask);
  return true;
}

/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
/// nullptr if no EVL-based recipe could be created.
/// \p HeaderMask Header Mask.
Expand Down Expand Up @@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe))
continue;
VPRecipeBase *EVLRecipe = createEVLRecipe(
HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL, PrevEVL);
if (!EVLRecipe)
Expand Down Expand Up @@ -3202,3 +3221,160 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
removeDeadRecipes(Plan);
}

/// For each widened store with a conditional mask, move the store and the
/// recipes that exclusively feed it into a new VPBasicBlock ("vector.if.bb")
/// guarded by a branch on any-of(mask), so the block is intended to be skipped
/// when no mask lane is active.
///
/// Stores are left alone when they have no mask, when the mask is one of the
/// plan's header masks, or when the mask is an active-lane-mask. A store is
/// also left alone when no other recipe can be moved together with it.
void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {

// Dominance is queried below to check where the users of a candidate sit
// relative to the store.
VPDominatorTree VPDT;
VPDT.recalculate(Plan);

SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);

// Get the mask from the store recipes. Returns nullptr for anything that is
// not a widened store, for unmasked stores, and for stores whose mask is a
// header mask or an active-lane-mask.
auto GetMask = [&HeaderMasks](VPRecipeBase &R) -> VPValue * {
using namespace llvm::VPlanPatternMatch;
if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R)) {
VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
// No mask at all: OrigMask is null here, so this returns nullptr.
if (!OrigMask)
return OrigMask;

// A mask that is exactly a header mask is not a conditional mask.
if (any_of(HeaderMasks, [OrigMask](VPValue *HeaderMask) {
return OrigMask == HeaderMask;
}))
return nullptr;

// Match active.lane.mask.
if (match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
m_VPValue(), m_VPValue())))
return nullptr;

return OrigMask;
}
return nullptr;
};

// First, collect all masked stores.
SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
for (VPRecipeBase &R : *VPBB) {
if (VPValue *Mask = GetMask(R))
MaskedStores.emplace_back(&R, Mask);
}
}

// Recipes that define an operand of something already in the current tree.
DenseSet<VPRecipeBase *> Candidates;
auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
for (VPValue *Op : R->operands())
if (VPRecipeBase *OpR = Op->getDefiningRecipe())
Candidates.insert(OpR);
};

// One entry per store to transform; each set starts with the store itself,
// followed by the recipes to move with it in the order they were discovered
// (walking backwards from the store).
SmallVector<SetVector<VPRecipeBase *>> Tries;
// pop_back_val() means the stores are handled last-collected first.
while (!MaskedStores.empty()) {
auto [LR, M] = MaskedStores.pop_back_val();
Candidates.clear();
AddOperandsToCandidates(LR);

SetVector<VPRecipeBase *> CurrentTree;
CurrentTree.insert(LR);

// The backwards scan over the store's block stops at the recipe defining
// the mask when that recipe is in the same block, and at the block's first
// non-phi recipe otherwise. The stop recipe itself is not visited.
// NOTE(review): in the second case the first non-phi recipe is therefore
// never considered for the tree -- confirm this is intended.
VPBasicBlock *MaskBlock =
M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
auto End = MaskBlock == LR->getParent()
? M->getDefiningRecipe()->getReverseIterator()
: LR->getParent()->getFirstNonPhi()->getReverseIterator();
// Greedily add all recipes that are used to compute stored value to the
// tree. All users of the added recipe must dominate the store
// recipe.
for (VPRecipeBase &R : make_range(LR->getReverseIterator(), End)) {
// Recipe is not a part of the tree
if (!Candidates.contains(&R))
continue;

// Reject R if any value it defines has a user that is not a recipe, or a
// recipe user that is neither the store nor properly dominating the store.
if (any_of(R.definedValues(), [&LR = LR, &VPDT](VPValue *Def) {
for (VPUser *U : Def->users()) {
if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
if (UR == LR || VPDT.properlyDominates(UR, LR))
continue;
}
return true;
}
return false;
}))
continue;

CurrentTree.insert(&R);
AddOperandsToCandidates(&R);
}
// Previous traversal could add recipes that are used by non-added recipes,
// thus need to be removed from the list.
// Iterate to a fixed point: dropping one recipe can leave its operands with
// a user outside the tree, which in turn disqualifies them.
DenseSet<VPRecipeBase *> ToRemove;
bool Changed;
do {
Changed = false;
for (VPRecipeBase *R : CurrentTree) {
if (ToRemove.contains(R))
continue;
if (any_of(R->definedValues(), [&](VPValue *Def) {
for (VPUser *U : Def->users()) {
if (auto *UR = dyn_cast<VPRecipeBase>(U))
if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
return true;
}
return false;
})) {
Changed = true;
ToRemove.insert(R);
}
}
} while (Changed);

for (VPRecipeBase *R : ToRemove)
CurrentTree.remove(R);

// A tree holding only the store has nothing else to move; skip it.
if (CurrentTree.size() > 1)
Tries.push_back(CurrentTree);
}
// Second phase: rewrite the CFG for every collected tree.
for (const auto &List : Tries) {
// The store was inserted first, so it is the front element.
VPRecipeBase *LR = List.front();
VPValue *M = cast<VPWidenMemoryRecipe>(LR)->getMask();
assert(M && "Mask VPValue must exist at this point");
// Recipes were gathered walking backwards from the store; reversing the
// list puts the store last.
auto Recipes = reverse(List.getArrayRef());

// Split current basic block at LR point so that the conditional
// VPBasicBlock can be added in between.
VPBasicBlock *ParentBB = LR->getParent();
VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());

// Create VPBB and insert it between ParentBB and ContBB.
VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
// If the continuation block has no successors, make it the exiting block
// of the enclosing loop region.
if (ContBB->getNumSuccessors() == 0)
ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);

// Move recipes into the conditional block.
for (VPRecipeBase *R : Recipes)
R->moveBefore(*IfBB, IfBB->end());

// Add the condition and branch in the parent block.
auto *ActiveLane =
new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");

// Only the LLVMContext is needed from the canonical IV type; the constant
// built below is an i1 zero.
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
LLVMContext &Ctx = CanonicalIVType->getContext();
VPValue *Zero =
Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt1Ty(Ctx), 0));

auto *BranchOnCount =
new VPInstruction(VPInstruction::BranchOnCount, {ActiveLane, Zero});
ParentBB->appendRecipe(ActiveLane);
ParentBB->appendRecipe(BranchOnCount);

// Set proper predecessor and successors for modified basic blocks.
// ParentBB's successors become (ContBB, IfBB) in that order; ContBB's
// predecessors become ParentBB and IfBB.
ParentBB->clearSuccessors();
ParentBB->setTwoSuccessors(ContBB, IfBB);
ContBB->clearPredecessors();
ContBB->setPredecessors({ParentBB, IfBB});
}
}
23 changes: 23 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,29 @@ struct VPlanTransforms {
/// removed in the future.
static DenseMap<VPBasicBlock *, VPValue *>
introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);

/// Try to convert flattened control flow into a conditional vector basic
/// block. If no bits in the mask are active, all the masked operations are
/// skipped. This transformation collects all masked operations bottom-up from
/// the masked stores and puts them in a new vector basic block. The original
/// vector.loop is split and the newly created basic block is inserted in
/// between.
///
/// After the transformation the VPlan looks like:
/// vector.loop:
/// ...
/// %any.active.mask = any-of(%Mask)
/// Branch-On-Count %any.active.mask, 0
/// successors vector.loop.split, vector.if.bb
///
/// vector.if.bb:
/// (Masked operations)
/// successors vector.loop.split
///
/// vector.loop.split:
/// ...
/// successors middle.block, vector.loop
static void optimizeConditionalVPBB(VPlan &Plan);
};

} // namespace llvm
Expand Down
Loading
Loading