
Commit c5329c8

[LV][AArch64] Prefer Fixed over Scalable if cost-model is equal (Neoverse V2) (#95819)
For the Neoverse V2 we would like to prefer fixed width over scalable vectorisation if the cost-model assigns an equal cost to both for certain loops. This improves 7 kernels from TSVC-2 and several production kernels by about 2x, and does not affect SPEC2017 INT and FP.

This also adds a new TTI hook that can steer the loop vectorizer to prefer fixed width vectorization, which can be set per CPU. For now, this is only enabled for the Neoverse V2.

There are 3 reasons why preferring NEON might be better when the cost-model is a tie and the SVE vector size is the same as NEON (128-bit): architectural reasons, micro-architecture reasons, and SVE codegen reasons. The latter will be improved over time, so the more important reasons are the former two. I.e., the (micro-)architectural reasons are the use of LDP/STP instructions, which are not available in SVE2, and the avoidance of predication.

For what it is worth: this codegen strategy of generating more NEON is in line with GCC's codegen strategy, which is actually even more aggressive in generating NEON when no predication is required. We could be smarter about the decision making, but this seems to be a good first step in the right direction, and we can always revise this later (for example, make the target hook more general).
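The heart of the change is the tie-break in LoopVectorizationPlanner::isMoreProfitable (see the LoopVectorize.cpp hunk below). The following is a minimal standalone C++ sketch of that decision; the Candidate struct, the free function and the costs are simplified stand-ins rather than the actual LLVM API, and only the hook name and the <= versus < tie-break mirror the patch:

// Minimal sketch of the tie-break introduced by this patch. Candidate and
// isMoreProfitable are simplified stand-ins, not the real LLVM types; only
// preferFixedOverScalableIfEqualCost() and the <=/< choice mirror the actual
// change in LoopVectorizationPlanner::isMoreProfitable.
#include <iostream>

struct Candidate {
  long Cost;       // cost assigned by the vectorizer's cost model
  bool IsScalable; // true for a scalable (SVE) VF, false for a fixed (NEON) VF
};

// Returns true if candidate A should be chosen over candidate B.
static bool isMoreProfitable(const Candidate &A, const Candidate &B,
                             bool PreferFixedIfEqualCost) {
  // Without the new hook, a scalable candidate wins a cost tie against a
  // fixed-width one (non-strict <=). When the hook returns true, the
  // comparison stays strict (<), so the fixed-width candidate is kept.
  bool PreferScalable =
      !PreferFixedIfEqualCost && A.IsScalable && !B.IsScalable;
  return PreferScalable ? A.Cost <= B.Cost : A.Cost < B.Cost;
}

int main() {
  Candidate Scalable{10, /*IsScalable=*/true};
  Candidate Fixed{10, /*IsScalable=*/false};
  // Generic CPU: the scalable plan wins the tie -> prints 1.
  std::cout << isMoreProfitable(Scalable, Fixed, false) << '\n';
  // Neoverse V2 (hook returns true): the fixed plan wins the tie -> prints 0.
  std::cout << isMoreProfitable(Scalable, Fixed, true) << '\n';
}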
1 parent 20c6b9f commit c5329c8

8 files changed: 87 additions, 1 deletion

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
@@ -1679,6 +1679,11 @@ class TargetTransformInfo {
         false; ///< If op is an fp min/max, whether NaNs may be present.
   };
 
+  /// \returns True if the targets prefers fixed width vectorization if the
+  /// loop vectorizer's cost-model assigns an equal cost to the fixed and
+  /// scalable version of the vectorized loop.
+  bool preferFixedOverScalableIfEqualCost() const;
+
   /// \returns True if the target prefers reductions in loop.
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
@@ -2156,6 +2161,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual bool preferFixedOverScalableIfEqualCost() const = 0;
   virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@@ -2891,6 +2897,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                 VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  bool preferFixedOverScalableIfEqualCost() const override {
+    return Impl.preferFixedOverScalableIfEqualCost();
+  }
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.preferInLoopReduction(Opcode, Ty, Flags);

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
@@ -920,6 +920,8 @@ class TargetTransformInfoImplBase {
     return VF;
   }
 
+  bool preferFixedOverScalableIfEqualCost() const { return false; }
+
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
@@ -1286,6 +1286,10 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }
 
+bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
+  return TTIImpl->preferFixedOverScalableIfEqualCost();
+}
+
 bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                                 ReductionFlags Flags) const {
   return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 4 additions & 0 deletions
@@ -355,6 +355,10 @@ def FeatureTHE : ExtensionWithMArch<"the", "THE", "FEAT_THE",
 // Armv9.0 Architecture Extensions
 //===----------------------------------------------------------------------===//
 
+def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost",
+    "UseFixedOverScalableIfEqualCost", "true",
+    "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">;
+
 def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
     "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 1 addition & 0 deletions
@@ -525,6 +525,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
+                                      FeatureUseFixedOverScalableIfEqualCost,
                                       FeaturePredictableSelectIsExpensive]>;
 
 def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
@@ -371,6 +371,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
+  bool preferFixedOverScalableIfEqualCost() const {
+    return ST->useFixedOverScalableIfEqualCost();
+  }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
 
   bool supportsScalableVectors() const {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 1 deletion
@@ -4630,7 +4630,9 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
-  bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
+  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+                        A.Width.isScalable() && !B.Width.isScalable();
+
   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                 const InstructionCost &RHS) {
     return PreferScalable ? LHS <= RHS : LHS < RHS;
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+; RUN: opt -S < %s -passes=loop-vectorize -force-target-instruction-cost=1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
+@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
+
+define void @NeoverseV2() #0 {
+; CHECK-LABEL: define void @NeoverseV2(
+; CHECK:       store <4 x float>
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  %add = fadd fast float %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 16000
+  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
+  store float %add, ptr %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @GenericCPU() #1 {
+; CHECK-LABEL: define void @GenericCPU(
+; CHECK:       store <vscale x 4 x float>
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  %add = fadd fast float %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 16000
+  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
+  store float %add, ptr %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" }
+attributes #1 = { vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" }
