Skip to content

Commit fc7e74e

Browse files
committed
[CostModel][X86] getCastInstrCost - improve CostKind adjustment when splitting src/dst types
Noticed in #90883 review - for non-Throughput costs, we weren't applying the split count to the '0 or 1' cost value. This still doesn't work well, as many of the type legalizations are hidden so we don't have the split count; really we need to move to a CostKindCosts-based cost table, but that's going to be a lot of work :/
1 parent bcdbd0b commit fc7e74e

17 files changed

+3584
-1993
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2121,10 +2121,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
21212121
assert(ISD && "Invalid opcode");
21222122

21232123
// TODO: Allow non-throughput costs that aren't binary.
2124-
auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2124+
auto AdjustCost = [&CostKind](InstructionCost Cost,
2125+
InstructionCost N = 1) -> InstructionCost {
21252126
if (CostKind != TTI::TCK_RecipThroughput)
2126-
return Cost == 0 ? 0 : 1;
2127-
return Cost;
2127+
return Cost == 0 ? 0 : N;
2128+
return Cost * N;
21282129
};
21292130

21302131
// The cost tables include both specific, custom (non-legal) src/dst type
@@ -3004,53 +3005,53 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
30043005
if (ST->hasBWI())
30053006
if (const auto *Entry = ConvertCostTableLookup(
30063007
AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3007-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3008+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30083009

30093010
if (ST->hasDQI())
30103011
if (const auto *Entry = ConvertCostTableLookup(
30113012
AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3012-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3013+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30133014

30143015
if (ST->hasAVX512())
30153016
if (const auto *Entry = ConvertCostTableLookup(
30163017
AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3017-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3018+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30183019
}
30193020

30203021
if (ST->hasBWI())
30213022
if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
30223023
LTDest.second, LTSrc.second))
3023-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3024+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30243025

30253026
if (ST->hasDQI())
30263027
if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
30273028
LTDest.second, LTSrc.second))
3028-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3029+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30293030

30303031
if (ST->hasAVX512())
30313032
if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
30323033
LTDest.second, LTSrc.second))
3033-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3034+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30343035

30353036
if (ST->hasAVX2())
30363037
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
30373038
LTDest.second, LTSrc.second))
3038-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3039+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30393040

30403041
if (ST->hasAVX())
30413042
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
30423043
LTDest.second, LTSrc.second))
3043-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3044+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30443045

30453046
if (ST->hasSSE41())
30463047
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
30473048
LTDest.second, LTSrc.second))
3048-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3049+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30493050

30503051
if (ST->hasSSE2())
30513052
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
30523053
LTDest.second, LTSrc.second))
3053-
return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3054+
return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
30543055

30553056
// Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
30563057
// sitofp.

llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,17 +56,17 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
5656
;
5757
; LATE-LABEL: 'umul'
5858
; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
59-
; LATE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
59+
; LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
6060
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
6161
;
6262
; SIZE-LABEL: 'umul'
6363
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
64-
; SIZE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
64+
; SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
6565
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
6666
;
6767
; SIZE_LATE-LABEL: 'umul'
6868
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
69-
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
69+
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
7070
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
7171
;
7272
%s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)

llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1883,13 +1883,13 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
18831883

18841884
define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
18851885
; SSE2-LABEL: 'test_gather_16f32_const_mask'
1886-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
1886+
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
18871887
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
18881888
; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
18891889
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
18901890
;
18911891
; SSE42-LABEL: 'test_gather_16f32_const_mask'
1892-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
1892+
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
18931893
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
18941894
; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
18951895
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1927,13 +1927,13 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
19271927

19281928
define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
19291929
; SSE2-LABEL: 'test_gather_16f32_var_mask'
1930-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
1930+
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
19311931
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
19321932
; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
19331933
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
19341934
;
19351935
; SSE42-LABEL: 'test_gather_16f32_var_mask'
1936-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
1936+
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
19371937
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
19381938
; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
19391939
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1971,13 +1971,13 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
19711971

19721972
define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
19731973
; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
1974-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
1974+
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
19751975
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
19761976
; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
19771977
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
19781978
;
19791979
; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
1980-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
1980+
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
19811981
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
19821982
; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
19831983
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2017,15 +2017,15 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
20172017
; SSE2-LABEL: 'test_gather_16f32_const_mask2'
20182018
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
20192019
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
2020-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
2020+
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
20212021
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
20222022
; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
20232023
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
20242024
;
20252025
; SSE42-LABEL: 'test_gather_16f32_const_mask2'
20262026
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
20272027
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
2028-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
2028+
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
20292029
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
20302030
; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
20312031
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2178,13 +2178,13 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
21782178

21792179
define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
21802180
; SSE2-LABEL: 'test_gather_4f32'
2181-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
2181+
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
21822182
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
21832183
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
21842184
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
21852185
;
21862186
; SSE42-LABEL: 'test_gather_4f32'
2187-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
2187+
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
21882188
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
21892189
; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
21902190
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
@@ -2228,13 +2228,13 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
22282228

22292229
define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
22302230
; SSE2-LABEL: 'test_gather_4f32_const_mask'
2231-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
2231+
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
22322232
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
22332233
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
22342234
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
22352235
;
22362236
; SSE42-LABEL: 'test_gather_4f32_const_mask'
2237-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
2237+
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
22382238
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
22392239
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
22402240
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res

0 commit comments

Comments
 (0)