diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..20c4b52098770 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1822,6 +1822,10 @@ class TargetTransformInfo {
   /// otherwise scalar epilogue loop.
   LLVM_ABI bool preferEpilogueVectorization() const;
 
+  /// Return true if the loop vectorizer should consider vectorizing with
+  /// flattened control flow; otherwise create conditional vector basic blocks.
+  LLVM_ABI bool preferFlattenControlFlow() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..1211c80f8ff51 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1091,6 +1091,8 @@ class TargetTransformInfoImplBase {
 
   virtual bool preferEpilogueVectorization() const { return true; }
 
+  virtual bool preferFlattenControlFlow() const { return true; }
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index ff8778168686d..5000bef968213 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -777,6 +777,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
 
+  bool preferFlattenControlFlow() const override {
+    return thisT()->preferFlattenControlFlow();
+  }
+
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0f857399660fe..d455e1dc63c13 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -371,6 +371,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
   return TTIImpl->preferPredicateOverEpilogue(TFI);
 }
 
+bool TargetTransformInfo::preferFlattenControlFlow() const {
+  return TTIImpl->preferFlattenControlFlow();
+}
+
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
     bool IVUpdateMayOverflow) const {
   return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..70f42a83493a9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -146,6 +146,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return false;
   }
 
+  bool preferFlattenControlFlow() const { return true; }
+
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2fe59a464457f..e136ec35b9002 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -351,6 +351,11 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<bool>
PreferFlattenControlFlow("prefer-flatten-control-flow", cl::init(true), + cl::Hidden, + cl::desc("Prefer flatten control flow.")); + cl::opt llvm::EnableVPlanNativePath( "enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " @@ -9287,6 +9292,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( } VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues); + if (!PreferFlattenControlFlow || !TTI.preferFlattenControlFlow()) + VPlanTransforms::optimizeConditionalVPBB(*Plan); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index dc5be520505eb..e1c8690986599 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2087,6 +2087,23 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->replaceAllUsesWith(LaneMask); } +static bool replaceHeaderMaskToEVL(VPValue *HeaderMask, VPRecipeBase *R) { + using namespace llvm::VPlanPatternMatch; + VPValue *EdgeMask; + if (!R) + return false; + if (match(R, m_Binary( + m_VPInstruction( + m_Binary( + m_Specific(HeaderMask), m_VPValue(EdgeMask))), + m_VPValue()))) { + + cast(R->getOperand(0))->setOperand(0, EdgeMask); + return true; + } + return false; +} + /// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns /// nullptr if no EVL-based recipe could be created. /// \p HeaderMask Header Mask. @@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { auto *CurRecipe = cast(U); + if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe)) + continue; VPRecipeBase *EVLRecipe = createEVLRecipe( HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL, PrevEVL); if (!EVLRecipe) @@ -3202,3 +3221,160 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); removeDeadRecipes(Plan); } + +void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) { + + VPDominatorTree VPDT; + VPDT.recalculate(Plan); + + SmallVector HeaderMasks = collectAllHeaderMasks(Plan); + + // Get the mask from the store recipes. + auto GetMask = [&HeaderMasks](VPRecipeBase &R) -> VPValue * { + using namespace llvm::VPlanPatternMatch; + if (isa(R)) { + VPValue *OrigMask = cast(R).getMask(); + if (!OrigMask) + return OrigMask; + + if (any_of(HeaderMasks, [OrigMask](VPValue *HeaderMask) { + return OrigMask == HeaderMask; + })) + return nullptr; + + // Match active.lane.mask. + if (match(OrigMask, m_VPInstruction( + m_VPValue(), m_VPValue()))) + return nullptr; + + return OrigMask; + } + return nullptr; + }; + + // First, collect all masked stores. 
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (VPValue *Mask = GetMask(R))
+        MaskedStores.emplace_back(&R, Mask);
+    }
+  }
+
+  DenseSet<VPRecipeBase *> Candidates;
+  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+    for (VPValue *Op : R->operands())
+      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+        Candidates.insert(OpR);
+  };
+
+  SmallVector<SetVector<VPRecipeBase *>> Trees;
+  while (!MaskedStores.empty()) {
+    auto [LR, M] = MaskedStores.pop_back_val();
+    Candidates.clear();
+    AddOperandsToCandidates(LR);
+
+    SetVector<VPRecipeBase *> CurrentTree;
+    CurrentTree.insert(LR);
+
+    // Walk backwards from the store to the mask definition if it is in the
+    // same block, or to the block's first non-phi recipe otherwise.
+    VPBasicBlock *MaskBlock =
+        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+    auto End = MaskBlock == LR->getParent()
+                   ? M->getDefiningRecipe()->getReverseIterator()
+                   : LR->getParent()->getFirstNonPhi()->getReverseIterator();
+    // Greedily add to the tree all recipes that are used to compute the
+    // stored value. All users of an added recipe must dominate the store
+    // recipe.
+    for (VPRecipeBase &R : make_range(LR->getReverseIterator(), End)) {
+      // The recipe is not part of the tree.
+      if (!Candidates.contains(&R))
+        continue;
+
+      if (any_of(R.definedValues(), [&LR = LR, &VPDT](VPValue *Def) {
+            for (VPUser *U : Def->users()) {
+              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+                if (UR == LR || VPDT.properlyDominates(UR, LR))
+                  continue;
+              }
+              return true;
+            }
+            return false;
+          }))
+        continue;
+
+      CurrentTree.insert(&R);
+      AddOperandsToCandidates(&R);
+    }
+    // The previous traversal may have added recipes that are also used by
+    // recipes outside the tree; those must be removed from the tree again.
+    DenseSet<VPRecipeBase *> ToRemove;
+    bool Changed;
+    do {
+      Changed = false;
+      for (VPRecipeBase *R : CurrentTree) {
+        if (ToRemove.contains(R))
+          continue;
+        if (any_of(R->definedValues(), [&](VPValue *Def) {
+              for (VPUser *U : Def->users()) {
+                if (auto *UR = dyn_cast<VPRecipeBase>(U))
+                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+                    return true;
+              }
+              return false;
+            })) {
+          Changed = true;
+          ToRemove.insert(R);
+        }
+      }
+    } while (Changed);
+
+    for (VPRecipeBase *R : ToRemove)
+      CurrentTree.remove(R);
+
+    if (CurrentTree.size() > 1)
+      Trees.push_back(CurrentTree);
+  }
+
+  for (const auto &List : Trees) {
+    VPRecipeBase *LR = List.front();
+    VPValue *M = cast<VPWidenStoreRecipe>(LR)->getMask();
+    assert(M && "Mask VPValue must exist at this point");
+    auto Recipes = reverse(List.getArrayRef());
+
+    // Split the current basic block at LR so that the conditional block can
+    // be inserted in between.
+    VPBasicBlock *ParentBB = LR->getParent();
+    VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());
+
+    // Create a new VPBB and insert it between ParentBB and ContBB.
+    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+    if (ContBB->getNumSuccessors() == 0)
+      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Move the recipes into the conditional block.
+    for (VPRecipeBase *R : Recipes)
+      R->moveBefore(*IfBB, IfBB->end());
+
+    // Add the condition and branch in the parent block.
+    auto *ActiveLane =
+        new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+
+    Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+    LLVMContext &Ctx = CanonicalIVType->getContext();
+    VPValue *Zero =
+        Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt1Ty(Ctx), 0));
+
+    // Branch to ContBB when no lane of the mask is active; otherwise execute
+    // the masked operations in IfBB.
+    auto *BranchOnCount =
+        new VPInstruction(VPInstruction::BranchOnCount, {ActiveLane, Zero});
+    ParentBB->appendRecipe(ActiveLane);
+    ParentBB->appendRecipe(BranchOnCount);
+
+    // Set the proper predecessors and successors for the modified basic
+    // blocks.
+    ParentBB->clearSuccessors();
+    ParentBB->setTwoSuccessors(ContBB, IfBB);
+    ContBB->clearPredecessors();
+    ContBB->setPredecessors({ParentBB, IfBB});
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b74..0e535d6bf7c36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -234,6 +234,29 @@ struct VPlanTransforms {
   /// removed in the future.
   static DenseMap<VPBasicBlock *, VPValue *>
   introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block: if the mask has no active bits, all masked operations are
+  /// skipped. The transformation collects the masked operations bottom-up
+  /// from each masked store and moves them into a newly created vector basic
+  /// block. The original vector loop is split and the new basic block is
+  /// inserted in between.
+  ///
+  /// After the transformation the VPlan looks like:
+  /// vector.loop:
+  ///   ...
+  ///   %any.active.mask = any-of(%Mask)
+  ///   Branch-On-Count %any.active.mask, 0
+  /// successors vector.loop.split, vector.if.bb
+  ///
+  /// vector.if.bb:
+  ///   (masked operations)
+  /// successors vector.loop.split
+  ///
+  /// vector.loop.split:
+  ///   ...
+  /// successors middle.block, vector.loop
+  static void optimizeConditionalVPBB(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..2b8f2542ab544
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-flatten-control-flow=false %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT:    [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT:    [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT:    [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN9_SPLIT:.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i1 [[TMP13]], false
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[IF_THEN9_SPLIT]], label %[[VECTOR_IF_BB:.*]]
+; CHECK:       [[VECTOR_IF_BB]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP11]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    br label %[[IF_THEN9_SPLIT]]
+; CHECK:       [[IF_THEN9_SPLIT]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT:    [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK:       [[IF_THEN9]]:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT:    store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp1 = icmp sgt i32 %reg.4.val, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %sh_prom = zext nneg i32 %control1 to i64
+  %shl = shl nuw i64 1, %sh_prom
+  %sh_prom5 = zext nneg i32 %control2 to i64
+  %shl6 = shl nuw i64 1, %sh_prom5
+  %sh_prom10 = zext nneg i32 %target to i64
+  %shl11 = shl nuw nsw i64 1, %sh_prom10
+  %wide.trip.count = zext nneg i32 %reg.4.val to i64
+  %0 = freeze i64 %shl6
+  %1 = or i64 %shl, %0
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+  %2 = load i64, ptr %arrayidx, align 8
+  %3 = and i64 %2, %1
+  %or.cond.not = icmp eq i64 %3, %1
+  br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+  %xor = xor i64 %2, %shl11
+  store i64 %xor, ptr %arrayidx, align 8
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.

(All `+` lines above belong to the new test file; the IR body and metadata
lines are added verbatim by the patch.)
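Note on using the new hook (reviewer illustration, not part of the patch): as wired up above, optimizeConditionalVPBB only runs when either the hidden -prefer-flatten-control-flow flag is set to false (as the new test does) or a target's TTI returns false from preferFlattenControlFlow(); both the base implementation and the RISC-V override currently return true. A minimal sketch of what a target opt-in could look like, where MyTargetTTIImpl is a hypothetical class and the usual TTI boilerplate is elided:

    // Hypothetical target override (illustrative only): ask the loop
    // vectorizer to emit conditional vector basic blocks instead of
    // flattened control flow whenever optimizeConditionalVPBB applies.
    class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
      // ... DataLayout, subtarget accessors, and other TTI plumbing ...
    public:
      bool preferFlattenControlFlow() const { return false; }
    };

This mirrors the RISCVTTIImpl override in the patch, which keeps the flattened form by returning true, so on RISC-V the conditional-block form remains opt-in via the command-line flag.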