Skip to content

Commit 4ab5d34

Browse files
committed
[LoopVectorizer] Prune VFs based on plan register pressure
Based on fhahn's work at llvm#126437. This PR moves the register usage checking to after the plans are created, so that any recipes that optimise register usage (such as partial reductions) can be properly costed and not have their VF pruned unnecessarily. It involves changing some tests, notably removing one from mve-known-tripcount.ll because it is no longer vectorisable due to high register pressure. tail-folding-reduces-vf.ll was modified to reduce its register pressure while still testing what was intended.
1 parent bcd7f54 commit 4ab5d34

File tree

10 files changed

+286
-662
lines changed

10 files changed

+286
-662
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 46 additions & 234 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,11 +1022,6 @@ class LoopVectorizationCostModel {
10221022
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
10231023
};
10241024

1025-
/// \return Returns information about the register usages of the loop for the
1026-
/// given vectorization factors.
1027-
SmallVector<RegisterUsage, 8>
1028-
calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1029-
10301025
/// Collect values we want to ignore in the cost model.
10311026
void collectValuesToIgnore();
10321027

@@ -4189,27 +4184,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
41894184
ComputeScalableMaxVF);
41904185
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
41914186

4192-
// Collect all viable vectorization factors larger than the default MaxVF
4193-
// (i.e. MaxVectorElementCount).
4194-
SmallVector<ElementCount, 8> VFs;
4187+
// Set the max VF to the largest viable vectorization factor less than or
4188+
// equal to the max vector element count.
41954189
for (ElementCount VS = MaxVectorElementCount * 2;
41964190
ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4197-
VFs.push_back(VS);
4198-
4199-
// For each VF calculate its register usage.
4200-
auto RUs = calculateRegisterUsage(VFs);
4191+
MaxVF = VS;
42014192

4202-
// Select the largest VF which doesn't require more registers than existing
4203-
// ones.
4204-
for (int I = RUs.size() - 1; I >= 0; --I) {
4205-
const auto &MLU = RUs[I].MaxLocalUsers;
4206-
if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4207-
return LU.second <= TTI.getNumberOfRegisters(LU.first);
4208-
})) {
4209-
MaxVF = VFs[I];
4210-
break;
4211-
}
4212-
}
42134193
if (ElementCount MinVF =
42144194
TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
42154195
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
@@ -5392,213 +5372,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
53925372
return 1;
53935373
}
53945374

5395-
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5396-
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5397-
// This function calculates the register usage by measuring the highest number
5398-
// of values that are alive at a single location. Obviously, this is a very
5399-
// rough estimation. We scan the loop in a topological order and
5400-
// assign a number to each instruction. We use RPO to ensure that defs are
5401-
// met before their users. We assume that each instruction that has in-loop
5402-
// users starts an interval. We record every time that an in-loop value is
5403-
// used, so we have a list of the first and last occurrences of each
5404-
// instruction. Next, we transpose this data structure into a multi map that
5405-
// holds the list of intervals that *end* at a specific location. This multi
5406-
// map allows us to perform a linear search. We scan the instructions linearly
5407-
// and record each time that a new interval starts, by placing it in a set.
5408-
// If we find this value in the multi-map then we remove it from the set.
5409-
// The max register usage is the maximum size of the set.
5410-
// We also search for instructions that are defined outside the loop, but are
5411-
// used inside the loop. We need this number separately from the max-interval
5412-
// usage number because when we unroll, loop-invariant values do not take
5413-
// more registers.
5414-
LoopBlocksDFS DFS(TheLoop);
5415-
DFS.perform(LI);
5416-
5417-
RegisterUsage RU;
5418-
5419-
// Each 'key' in the map opens a new interval. The values
5420-
// of the map are the index of the 'last seen' usage of the
5421-
// instruction that is the key.
5422-
using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5423-
5424-
// Maps instruction to its index.
5425-
SmallVector<Instruction *, 64> IdxToInstr;
5426-
// Marks the end of each interval.
5427-
IntervalMap EndPoint;
5428-
// Saves the list of instruction indices that are used in the loop.
5429-
SmallPtrSet<Instruction *, 8> Ends;
5430-
// Saves the list of values that are used in the loop but are defined outside
5431-
// the loop (not including non-instruction values such as arguments and
5432-
// constants).
5433-
SmallSetVector<Instruction *, 8> LoopInvariants;
5434-
5435-
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5436-
for (Instruction &I : BB->instructionsWithoutDebug()) {
5437-
IdxToInstr.push_back(&I);
5438-
5439-
// Save the end location of each USE.
5440-
for (Value *U : I.operands()) {
5441-
auto *Instr = dyn_cast<Instruction>(U);
5442-
5443-
// Ignore non-instruction values such as arguments, constants, etc.
5444-
// FIXME: Might need some motivation why these values are ignored. If
5445-
// for example an argument is used inside the loop it will increase the
5446-
// register pressure (so shouldn't we add it to LoopInvariants).
5447-
if (!Instr)
5448-
continue;
5449-
5450-
// If this instruction is outside the loop then record it and continue.
5451-
if (!TheLoop->contains(Instr)) {
5452-
LoopInvariants.insert(Instr);
5453-
continue;
5454-
}
5455-
5456-
// Overwrite previous end points.
5457-
EndPoint[Instr] = IdxToInstr.size();
5458-
Ends.insert(Instr);
5459-
}
5460-
}
5461-
}
5462-
5463-
// Saves the list of intervals that end with the index in 'key'.
5464-
using InstrList = SmallVector<Instruction *, 2>;
5465-
SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5466-
5467-
// Transpose the EndPoints to a list of values that end at each index.
5468-
for (auto &Interval : EndPoint)
5469-
TransposeEnds[Interval.second].push_back(Interval.first);
5470-
5471-
SmallPtrSet<Instruction *, 8> OpenIntervals;
5472-
SmallVector<RegisterUsage, 8> RUs(VFs.size());
5473-
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5474-
5475-
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5476-
5477-
const auto &TTICapture = TTI;
5478-
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5479-
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5480-
(VF.isScalable() &&
5481-
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
5482-
return 0;
5483-
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5484-
};
5485-
5486-
collectInLoopReductions();
5487-
5488-
for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5489-
Instruction *I = IdxToInstr[Idx];
5490-
5491-
// Remove all of the instructions that end at this location.
5492-
InstrList &List = TransposeEnds[Idx];
5493-
for (Instruction *ToRemove : List)
5494-
OpenIntervals.erase(ToRemove);
5495-
5496-
// Ignore instructions that are never used within the loop and do not have
5497-
// side-effects.
5498-
if (!Ends.count(I) && !I->mayHaveSideEffects())
5499-
continue;
5500-
5501-
// Skip ignored values.
5502-
if (ValuesToIgnore.count(I))
5503-
continue;
5504-
5505-
// For each VF find the maximum usage of registers.
5506-
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5507-
// Count the number of registers used, per register class, given all open
5508-
// intervals.
5509-
// Note that elements in this SmallMapVector will be default constructed
5510-
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5511-
// there is no previous entry for ClassID.
5512-
SmallMapVector<unsigned, unsigned, 4> RegUsage;
5513-
5514-
if (VFs[J].isScalar()) {
5515-
for (auto *Inst : OpenIntervals) {
5516-
unsigned ClassID =
5517-
TTI.getRegisterClassForType(false, Inst->getType());
5518-
// FIXME: The target might use more than one register for the type
5519-
// even in the scalar case.
5520-
RegUsage[ClassID] += 1;
5521-
}
5522-
} else {
5523-
collectNonVectorizedAndSetWideningDecisions(VFs[J]);
5524-
for (auto *Inst : OpenIntervals) {
5525-
// Skip ignored values for VF > 1.
5526-
if (VecValuesToIgnore.count(Inst))
5527-
continue;
5528-
if (isScalarAfterVectorization(Inst, VFs[J])) {
5529-
unsigned ClassID =
5530-
TTI.getRegisterClassForType(false, Inst->getType());
5531-
// FIXME: The target might use more than one register for the type
5532-
// even in the scalar case.
5533-
RegUsage[ClassID] += 1;
5534-
} else {
5535-
unsigned ClassID =
5536-
TTI.getRegisterClassForType(true, Inst->getType());
5537-
RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5538-
}
5539-
}
5540-
}
5541-
5542-
for (const auto &Pair : RegUsage) {
5543-
auto &Entry = MaxUsages[J][Pair.first];
5544-
Entry = std::max(Entry, Pair.second);
5545-
}
5546-
}
5547-
5548-
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5549-
<< OpenIntervals.size() << '\n');
5550-
5551-
// Add the current instruction to the list of open intervals.
5552-
OpenIntervals.insert(I);
5553-
}
5554-
5555-
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5556-
// Note that elements in this SmallMapVector will be default constructed
5557-
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5558-
// there is no previous entry for ClassID.
5559-
SmallMapVector<unsigned, unsigned, 4> Invariant;
5560-
5561-
for (auto *Inst : LoopInvariants) {
5562-
// FIXME: The target might use more than one register for the type
5563-
// even in the scalar case.
5564-
bool IsScalar = all_of(Inst->users(), [&](User *U) {
5565-
auto *I = cast<Instruction>(U);
5566-
return TheLoop != LI->getLoopFor(I->getParent()) ||
5567-
isScalarAfterVectorization(I, VFs[Idx]);
5568-
});
5569-
5570-
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5571-
unsigned ClassID =
5572-
TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5573-
Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5574-
}
5575-
5576-
LLVM_DEBUG({
5577-
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5578-
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5579-
<< " item\n";
5580-
for (const auto &pair : MaxUsages[Idx]) {
5581-
dbgs() << "LV(REG): RegisterClass: "
5582-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5583-
<< " registers\n";
5584-
}
5585-
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5586-
<< " item\n";
5587-
for (const auto &pair : Invariant) {
5588-
dbgs() << "LV(REG): RegisterClass: "
5589-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5590-
<< " registers\n";
5591-
}
5592-
});
5593-
5594-
RU.LoopInvariantRegs = Invariant;
5595-
RU.MaxLocalUsers = MaxUsages[Idx];
5596-
RUs[Idx] = RU;
5597-
}
5598-
5599-
return RUs;
5600-
}
5601-
56025375
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
56035376
ElementCount VF) {
56045377
// TODO: Cost model for emulated masked load/store is completely
@@ -7764,7 +7537,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
77647537
}
77657538

77667539
for (auto &P : VPlans) {
7767-
for (ElementCount VF : P->vectorFactors()) {
7540+
SmallVector<ElementCount, 1> VFs(P->vectorFactors());
7541+
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
7542+
for (unsigned I = 0; I < VFs.size(); I++) {
7543+
auto VF = VFs[I];
77687544
if (VF.isScalar())
77697545
continue;
77707546
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -7777,12 +7553,23 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
77777553

77787554
InstructionCost Cost = cost(*P, VF);
77797555
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7780-
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7781-
BestFactor = CurrentFactor;
7782-
77837556
// If profitable add it to ProfitableVF list.
77847557
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
77857558
ProfitableVFs.push_back(CurrentFactor);
7559+
7560+
// Make sure that the VF doesn't use more than the number of available
7561+
// registers.
7562+
const auto &MLU = RUs[I].MaxLocalUsers;
7563+
if (any_of(MLU, [&](decltype(MLU.front()) &LU) {
7564+
return LU.second > TTI.getNumberOfRegisters(LU.first);
7565+
})) {
7566+
LLVM_DEBUG(dbgs() << "LV(REG): Ignoring VF " << VF
7567+
<< " as it uses too many registers\n");
7568+
continue;
7569+
}
7570+
7571+
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7572+
BestFactor = CurrentFactor;
77867573
}
77877574
}
77887575

@@ -7794,6 +7581,30 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
77947581
VectorizationFactor LegacyVF = selectVectorizationFactor();
77957582
VPlan &BestPlan = getPlanFor(BestFactor.Width);
77967583

7584+
// VPlan calculates register pressure from the plan, so it can come to
7585+
// different conclusions than the legacy cost model.
7586+
bool RegUsageDeterminedVF = false;
7587+
if (BestFactor.Width != LegacyVF.Width) {
7588+
SmallVector<ElementCount, 1> LegacyVFs = {LegacyVF.Width};
7589+
SmallVector<ElementCount, 1> VFs = {BestFactor.Width};
7590+
7591+
auto LegacyRUs =
7592+
::calculateRegisterUsage(getPlanFor(LegacyVF.Width), LegacyVFs, TTI, CM.ValuesToIgnore);
7593+
auto RUs = ::calculateRegisterUsage(BestPlan, VFs, TTI, CM.ValuesToIgnore);
7594+
7595+
auto GetMaxUsage = [](
7596+
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers) {
7597+
unsigned Max = 0;
7598+
for (auto Pair : MaxLocalUsers)
7599+
if (Pair.second > Max)
7600+
Max = Pair.second;
7601+
return Max;
7602+
};
7603+
unsigned MaxLegacyRegUsage = GetMaxUsage(LegacyRUs[0].MaxLocalUsers);
7604+
unsigned MaxRegUsage = GetMaxUsage(RUs[0].MaxLocalUsers);
7605+
RegUsageDeterminedVF = MaxRegUsage <= MaxLegacyRegUsage;
7606+
}
7607+
77977608
// Pre-compute the cost and use it to check if BestPlan contains any
77987609
// simplifications not accounted for in the legacy cost model. If that's the
77997610
// case, don't trigger the assertion, as the extra simplifications may cause a
@@ -7805,6 +7616,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78057616
// with early exits and plans with additional VPlan simplifications. The
78067617
// legacy cost model doesn't properly model costs for such loops.
78077618
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7619+
RegUsageDeterminedVF ||
78087620
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
78097621
CostCtx, OrigLoop) ||
78107622
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -798,9 +798,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
798798
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
799799
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
800800
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
801-
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
801+
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
802802
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
803-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
803+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP54]], align 1
804804
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
805805
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
806806
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
@@ -840,9 +840,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
840840
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
841841
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
842842
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
843-
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
843+
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
844844
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]]
845-
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
845+
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP38]]
846846
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
847847
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
848848
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
@@ -869,10 +869,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
869869
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
870870
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
871871
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
872-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
873-
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
874-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
872+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
875873
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
874+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX33:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
875+
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX33]])
876876
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
877877
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
878878
; CHECK-INTERLEAVED: scalar.ph:
@@ -946,6 +946,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
946946
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
947947
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
948948
; CHECK-MAXBW: scalar.ph:
949+
;
949950
entry:
950951
br label %for.body
951952

0 commit comments

Comments
 (0)