Skip to content

Commit 38c92c1

Browse files
overmighty authored and dzhidzhoev committed
[AArch64] Add patterns for FMADD, FMSUB
The FMADD and FMSUB instructions perform at least as well as the indexed (by-element) FMLA and FMLS instructions.

For example, the Arm Cortex-A55 Software Optimization Guide lists the "FP multiply accumulate" FMADD, FMSUB instructions with a throughput of 2 IPC, whereas it lists the "ASIMD FP multiply accumulate, by element" FMLA, FMLS instructions with a throughput of 1 IPC.

The Arm Cortex-A77 Software Optimization Guide, however, does not separately list "by element" variants of the "ASIMD FP multiply accumulate" instructions, which are listed with the same throughput of 2 IPC as the "FP multiply accumulate" instructions.

Reviewed By: samtebbs, dzhidzhoev

Differential Revision: https://reviews.llvm.org/D158008
1 parent 0563725 commit 38c92c1

File tree

5 files changed

+470
-83
lines changed

5 files changed

+470
-83
lines changed

clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
103103
// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
104104
// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
105105
// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
106-
// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
106+
// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
107107
// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
108108
// COMMONIR: ret <1 x double> [[TMP7]]
109109
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
@@ -122,7 +122,7 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
122122
// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
123123
// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
124124
// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
125-
// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
125+
// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
126126
// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
127127
// COMMONIR: ret <1 x double> [[TMP7]]
128128
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5409,6 +5409,44 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
54095409
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
54105410
let Inst{23-22} = 0b01; // 64-bit size flag
54115411
}
5412+
5413+
let Predicates = [HasFullFP16] in {
5414+
def : Pat<(f16 (node (f16 FPR16:$Rn),
5415+
(f16 (extractelt (v8f16 V128:$Rm), (i64 0))),
5416+
(f16 FPR16:$Ra))),
5417+
(!cast<Instruction>(NAME # Hrrr)
5418+
FPR16:$Rn, (f16 (EXTRACT_SUBREG V128:$Rm, hsub)), FPR16:$Ra)>;
5419+
5420+
def : Pat<(f16 (node (f16 (extractelt (v8f16 V128:$Rn), (i64 0))),
5421+
(f16 FPR16:$Rm),
5422+
(f16 FPR16:$Ra))),
5423+
(!cast<Instruction>(NAME # Hrrr)
5424+
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), FPR16:$Rm, FPR16:$Ra)>;
5425+
}
5426+
5427+
def : Pat<(f32 (node (f32 FPR32:$Rn),
5428+
(f32 (extractelt (v4f32 V128:$Rm), (i64 0))),
5429+
(f32 FPR32:$Ra))),
5430+
(!cast<Instruction>(NAME # Srrr)
5431+
FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>;
5432+
5433+
def : Pat<(f32 (node (f32 (extractelt (v4f32 V128:$Rn), (i64 0))),
5434+
(f32 FPR32:$Rm),
5435+
(f32 FPR32:$Ra))),
5436+
(!cast<Instruction>(NAME # Srrr)
5437+
(EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>;
5438+
5439+
def : Pat<(f64 (node (f64 FPR64:$Rn),
5440+
(f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
5441+
(f64 FPR64:$Ra))),
5442+
(!cast<Instruction>(NAME # Drrr)
5443+
FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>;
5444+
5445+
def : Pat<(f64 (node (f64 (extractelt (v2f64 V128:$Rn), (i64 0))),
5446+
(f64 FPR64:$Rm),
5447+
(f64 FPR64:$Ra))),
5448+
(!cast<Instruction>(NAME # Drrr)
5449+
(EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>;
54125450
}
54135451

54145452
//---

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -7,16 +7,15 @@ target triple = "aarch64"
77
define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
88
; CHECK-LABEL: complex_mul_v2f16:
99
; CHECK: // %bb.0: // %entry
10-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1110
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
12-
; CHECK-NEXT: mov h3, v0.h[1]
13-
; CHECK-NEXT: mov h2, v1.h[1]
14-
; CHECK-NEXT: fmul h4, h0, v1.h[1]
15-
; CHECK-NEXT: fnmul h2, h3, h2
16-
; CHECK-NEXT: fmla h4, h3, v1.h[0]
17-
; CHECK-NEXT: fmla h2, h0, v1.h[0]
18-
; CHECK-NEXT: mov v2.h[1], v4.h[0]
19-
; CHECK-NEXT: fmov d0, d2
11+
; CHECK-NEXT: mov h2, v0.h[1]
12+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
13+
; CHECK-NEXT: fmul h3, h0, v1.h[1]
14+
; CHECK-NEXT: fmul h4, h2, v1.h[1]
15+
; CHECK-NEXT: fmadd h2, h1, h2, h3
16+
; CHECK-NEXT: fnmsub h0, h1, h0, h4
17+
; CHECK-NEXT: mov v0.h[1], v2.h[0]
18+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
2019
; CHECK-NEXT: ret
2120
entry:
2221
%a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 0>

llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll

Lines changed: 108 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -80,29 +80,75 @@ entry:
8080
ret <8 x half> %0
8181
}
8282

83-
define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
84-
; CHECK-LABEL: t_vfmah_lane_f16:
83+
define dso_local half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
84+
; CHECK-LABEL: t_vfmah_lane_f16_0:
8585
; CHECK: // %bb.0: // %entry
8686
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
87-
; CHECK-NEXT: fmla h0, h1, v2.h[0]
87+
; CHECK-NEXT: fmadd h0, h1, h2, h0
8888
; CHECK-NEXT: ret
8989
entry:
9090
%extract = extractelement <4 x half> %c, i32 0
9191
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
9292
ret half %0
9393
}
9494

95-
define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
96-
; CHECK-LABEL: t_vfmah_laneq_f16:
95+
define dso_local half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
96+
; CHECK-LABEL: t_vfmah_lane_f16_0_swap:
9797
; CHECK: // %bb.0: // %entry
98-
; CHECK-NEXT: fmla h0, h1, v2.h[0]
98+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
99+
; CHECK-NEXT: fmadd h0, h2, h1, h0
100+
; CHECK-NEXT: ret
101+
entry:
102+
%extract = extractelement <4 x half> %c, i32 0
103+
%0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a)
104+
ret half %0
105+
}
106+
107+
define dso_local half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
108+
; CHECK-LABEL: t_vfmah_lane_f16_3:
109+
; CHECK: // %bb.0: // %entry
110+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
111+
; CHECK-NEXT: fmla h0, h1, v2.h[3]
112+
; CHECK-NEXT: ret
113+
entry:
114+
%extract = extractelement <4 x half> %c, i32 3
115+
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
116+
ret half %0
117+
}
118+
119+
define dso_local half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
120+
; CHECK-LABEL: t_vfmah_laneq_f16_0:
121+
; CHECK: // %bb.0: // %entry
122+
; CHECK-NEXT: fmadd h0, h1, h2, h0
99123
; CHECK-NEXT: ret
100124
entry:
101125
%extract = extractelement <8 x half> %c, i32 0
102126
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
103127
ret half %0
104128
}
105129

130+
define dso_local half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
131+
; CHECK-LABEL: t_vfmah_laneq_f16_0_swap:
132+
; CHECK: // %bb.0: // %entry
133+
; CHECK-NEXT: fmadd h0, h2, h1, h0
134+
; CHECK-NEXT: ret
135+
entry:
136+
%extract = extractelement <8 x half> %c, i32 0
137+
%0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a)
138+
ret half %0
139+
}
140+
141+
define dso_local half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
142+
; CHECK-LABEL: t_vfmah_laneq_f16_7:
143+
; CHECK: // %bb.0: // %entry
144+
; CHECK-NEXT: fmla h0, h1, v2.h[7]
145+
; CHECK-NEXT: ret
146+
entry:
147+
%extract = extractelement <8 x half> %c, i32 7
148+
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
149+
ret half %0
150+
}
151+
106152
define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
107153
; CHECK-LABEL: t_vfms_lane_f16:
108154
; CHECK: // %bb.0: // %entry
@@ -181,23 +227,49 @@ entry:
181227
ret <8 x half> %0
182228
}
183229

184-
define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
185-
; CHECK-LABEL: t_vfmsh_lane_f16:
230+
define dso_local half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
231+
; CHECK-LABEL: t_vfmsh_lane_f16_0:
232+
; CHECK: // %bb.0: // %entry
233+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
234+
; CHECK-NEXT: fmsub h0, h2, h1, h0
235+
; CHECK-NEXT: ret
236+
entry:
237+
%0 = fsub half 0xH8000, %b
238+
%extract = extractelement <4 x half> %c, i32 0
239+
%1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
240+
ret half %1
241+
}
242+
243+
define dso_local half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
244+
; CHECK-LABEL: t_vfmsh_lane_f16_0_swap:
186245
; CHECK: // %bb.0: // %entry
187246
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
188-
; CHECK-NEXT: fmls h0, h1, v2.h[0]
247+
; CHECK-NEXT: fmsub h0, h2, h1, h0
189248
; CHECK-NEXT: ret
190249
entry:
191250
%0 = fsub half 0xH8000, %b
192251
%extract = extractelement <4 x half> %c, i32 0
252+
%1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a)
253+
ret half %1
254+
}
255+
256+
define dso_local half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
257+
; CHECK-LABEL: t_vfmsh_lane_f16_3:
258+
; CHECK: // %bb.0: // %entry
259+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
260+
; CHECK-NEXT: fmls h0, h1, v2.h[3]
261+
; CHECK-NEXT: ret
262+
entry:
263+
%0 = fsub half 0xH8000, %b
264+
%extract = extractelement <4 x half> %c, i32 3
193265
%1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
194266
ret half %1
195267
}
196268

197-
define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
198-
; CHECK-LABEL: t_vfmsh_laneq_f16:
269+
define dso_local half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
270+
; CHECK-LABEL: t_vfmsh_laneq_f16_0:
199271
; CHECK: // %bb.0: // %entry
200-
; CHECK-NEXT: fmls h0, h1, v2.h[0]
272+
; CHECK-NEXT: fmsub h0, h2, h1, h0
201273
; CHECK-NEXT: ret
202274
entry:
203275
%0 = fsub half 0xH8000, %b
@@ -206,6 +278,30 @@ entry:
206278
ret half %1
207279
}
208280

281+
define dso_local half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
282+
; CHECK-LABEL: t_vfmsh_laneq_f16_0_swap:
283+
; CHECK: // %bb.0: // %entry
284+
; CHECK-NEXT: fmsub h0, h2, h1, h0
285+
; CHECK-NEXT: ret
286+
entry:
287+
%0 = fsub half 0xH8000, %b
288+
%extract = extractelement <8 x half> %c, i32 0
289+
%1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a)
290+
ret half %1
291+
}
292+
293+
define dso_local half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
294+
; CHECK-LABEL: t_vfmsh_laneq_f16_7:
295+
; CHECK: // %bb.0: // %entry
296+
; CHECK-NEXT: fmls h0, h1, v2.h[7]
297+
; CHECK-NEXT: ret
298+
entry:
299+
%0 = fsub half 0xH8000, %b
300+
%extract = extractelement <8 x half> %c, i32 7
301+
%1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
302+
ret half %1
303+
}
304+
209305
define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
210306
; CHECK-LABEL: t_vmul_laneq_f16:
211307
; CHECK: // %bb.0: // %entry

0 commit comments

Comments
 (0)