Skip to content

Commit 1dd0d3c

Browse files
authored
[AArch64][GISel] Fold COPY(y:gpr, DUP(x:fpr, i)) -> UMOV(y:gpr, x:fpr, i) (#89017)
This patch adds a peephole to AArch64PostSelectOptimize to clean up codegen caused by RegBankSelect restricting both the input and output registers of G_EXTRACT_VECTOR_ELT to the FPR bank. This can cause a COPY from FPR to GPR to be generated when, for example, the output register of the G_EXTRACT_VECTOR_ELT is used in a branch condition. This was noticed when looking at codegen differences between SelectionDAG (SDAG) and GlobalISel (GI) for the s1279 kernel in the TSVC benchmark.
1 parent 3a4c1b9 commit 1dd0d3c

File tree

10 files changed

+238
-262
lines changed

10 files changed

+238
-262
lines changed

llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class AArch64PostSelectOptimize : public MachineFunctionPass {
4848
bool doPeepholeOpts(MachineBasicBlock &MBB);
4949
/// Look for cross regclass copies that can be trivially eliminated.
5050
bool foldSimpleCrossClassCopies(MachineInstr &MI);
51+
bool foldCopyDup(MachineInstr &MI);
5152
};
5253
} // end anonymous namespace
5354

@@ -105,7 +106,10 @@ unsigned getNonFlagSettingVariant(unsigned Opc) {
105106
bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
106107
bool Changed = false;
107108
for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
108-
Changed |= foldSimpleCrossClassCopies(MI);
109+
bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
110+
if (!CurrentIterChanged)
111+
CurrentIterChanged |= foldCopyDup(MI);
112+
Changed |= CurrentIterChanged;
109113
}
110114
return Changed;
111115
}
@@ -158,6 +162,68 @@ bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
158162
return true;
159163
}
160164

165+
/// Try to replace a COPY whose source is produced by a lane DUP with a single
/// UMOV that writes the GPR destination directly.
///
/// The fold is attempted only when \p MI is a COPY between two virtual
/// registers, the destination has exactly the GPR32 (or GPR64) register class,
/// the source has exactly the FPR32 (or FPR64) register class, and the source
/// is defined by a DUPi32 (or DUPi64) whose result has one non-debug use.
/// On success both the DUP and \p MI are erased.
///
/// \returns true if the COPY/DUP pair was replaced by a UMOV, false if \p MI
/// was left untouched.
bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
166+
if (!MI.isCopy())
167+
return false;
168+
169+
auto *MF = MI.getMF();
170+
auto &MRI = MF->getRegInfo();
171+
auto *TII = MF->getSubtarget().getInstrInfo();
172+
173+
// Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
174+
// Here Dst is y and Src is the result of DUP.
175+
Register Dst = MI.getOperand(0).getReg();
176+
Register Src = MI.getOperand(1).getReg();
177+
178+
// Only virtual registers are handled; the register-class queries below are
// made on Dst and Src.
if (!Dst.isVirtual() || !Src.isVirtual())
179+
return false;
180+
181+
// Attempts the fold for one (GPR class, FPR class, DUP opcode, UMOV opcode)
// combination. Returns true after rewriting, false if any precondition fails.
auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
182+
const TargetRegisterClass *FPRRegClass, unsigned DUP,
183+
unsigned UMOV) {
184+
// The register classes must match exactly (pointer comparison); a register
// with no class assigned (null) never matches.
if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
185+
MRI.getRegClassOrNull(Src) != FPRRegClass)
186+
return false;
187+
188+
// There is a special case when one of the uses is COPY(z:FPR, y:GPR).
189+
// In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
190+
// be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
191+
// not worthwhile in that case.
192+
for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
193+
if (!Use.isCopy())
194+
continue;
195+
196+
Register UseOp0 = Use.getOperand(0).getReg();
197+
Register UseOp1 = Use.getOperand(1).getReg();
198+
// Give up entirely if a COPY user of Dst involves a physical register.
if (UseOp0.isPhysical() || UseOp1.isPhysical())
199+
return false;
200+
201+
// A COPY user that moves the value back from the GPR class to the FPR
// class is the special case described above.
if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
202+
MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
203+
return false;
204+
}
205+
206+
// Src must be defined by the expected DUP opcode, and that DUP's result must
// have exactly one non-debug use, because the DUP is erased below.
MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
207+
if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
208+
return false;
209+
210+
// Operand 1 of the DUP is the register it reads from and operand 2 is the
// immediate (x and i in the pattern above).
Register DupSrc = SrcMI->getOperand(1).getReg();
211+
int64_t DupImm = SrcMI->getOperand(2).getImm();
212+
213+
// Build the UMOV in front of the COPY, defining Dst and reusing the COPY's
// debug location, then erase the DUP and the COPY.
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
214+
.addReg(DupSrc)
215+
.addImm(DupImm);
216+
SrcMI->eraseFromParent();
217+
MI.eraseFromParent();
218+
return true;
219+
};
220+
221+
// Try the 32-bit combination first; the 64-bit one is attempted only if the
// 32-bit attempt returned false.
return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
222+
AArch64::DUPi32, AArch64::UMOVvi32) ||
223+
TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
224+
AArch64::DUPi64, AArch64::UMOVvi64);
225+
}
226+
161227
bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
162228
// If we find a dead NZCV implicit-def, we
163229
// - try to convert the operation to a non-flag-setting equivalent

llvm/test/CodeGen/AArch64/aarch64-mulv.ll

Lines changed: 39 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -25,22 +25,13 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
2525
declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>)
2626

2727
define i8 @mulv_v2i8(<2 x i8> %a) {
28-
; CHECK-SD-LABEL: mulv_v2i8:
29-
; CHECK-SD: // %bb.0: // %entry
30-
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
31-
; CHECK-SD-NEXT: mov w8, v0.s[1]
32-
; CHECK-SD-NEXT: fmov w9, s0
33-
; CHECK-SD-NEXT: mul w0, w9, w8
34-
; CHECK-SD-NEXT: ret
35-
;
36-
; CHECK-GI-LABEL: mulv_v2i8:
37-
; CHECK-GI: // %bb.0: // %entry
38-
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
39-
; CHECK-GI-NEXT: mov s1, v0.s[1]
40-
; CHECK-GI-NEXT: fmov w8, s0
41-
; CHECK-GI-NEXT: fmov w9, s1
42-
; CHECK-GI-NEXT: mul w0, w8, w9
43-
; CHECK-GI-NEXT: ret
28+
; CHECK-LABEL: mulv_v2i8:
29+
; CHECK: // %bb.0: // %entry
30+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
31+
; CHECK-NEXT: mov w8, v0.s[1]
32+
; CHECK-NEXT: fmov w9, s0
33+
; CHECK-NEXT: mul w0, w9, w8
34+
; CHECK-NEXT: ret
4435
entry:
4536
%arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a)
4637
ret i8 %arg1
@@ -230,22 +221,13 @@ entry:
230221
}
231222

232223
define i16 @mulv_v2i16(<2 x i16> %a) {
233-
; CHECK-SD-LABEL: mulv_v2i16:
234-
; CHECK-SD: // %bb.0: // %entry
235-
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
236-
; CHECK-SD-NEXT: mov w8, v0.s[1]
237-
; CHECK-SD-NEXT: fmov w9, s0
238-
; CHECK-SD-NEXT: mul w0, w9, w8
239-
; CHECK-SD-NEXT: ret
240-
;
241-
; CHECK-GI-LABEL: mulv_v2i16:
242-
; CHECK-GI: // %bb.0: // %entry
243-
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
244-
; CHECK-GI-NEXT: mov s1, v0.s[1]
245-
; CHECK-GI-NEXT: fmov w8, s0
246-
; CHECK-GI-NEXT: fmov w9, s1
247-
; CHECK-GI-NEXT: mul w0, w8, w9
248-
; CHECK-GI-NEXT: ret
224+
; CHECK-LABEL: mulv_v2i16:
225+
; CHECK: // %bb.0: // %entry
226+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
227+
; CHECK-NEXT: mov w8, v0.s[1]
228+
; CHECK-NEXT: fmov w9, s0
229+
; CHECK-NEXT: mul w0, w9, w8
230+
; CHECK-NEXT: ret
249231
entry:
250232
%arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a)
251233
ret i16 %arg1
@@ -372,22 +354,13 @@ entry:
372354
}
373355

374356
define i32 @mulv_v2i32(<2 x i32> %a) {
375-
; CHECK-SD-LABEL: mulv_v2i32:
376-
; CHECK-SD: // %bb.0: // %entry
377-
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
378-
; CHECK-SD-NEXT: mov w8, v0.s[1]
379-
; CHECK-SD-NEXT: fmov w9, s0
380-
; CHECK-SD-NEXT: mul w0, w9, w8
381-
; CHECK-SD-NEXT: ret
382-
;
383-
; CHECK-GI-LABEL: mulv_v2i32:
384-
; CHECK-GI: // %bb.0: // %entry
385-
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
386-
; CHECK-GI-NEXT: mov s1, v0.s[1]
387-
; CHECK-GI-NEXT: fmov w8, s0
388-
; CHECK-GI-NEXT: fmov w9, s1
389-
; CHECK-GI-NEXT: mul w0, w8, w9
390-
; CHECK-GI-NEXT: ret
357+
; CHECK-LABEL: mulv_v2i32:
358+
; CHECK: // %bb.0: // %entry
359+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
360+
; CHECK-NEXT: mov w8, v0.s[1]
361+
; CHECK-NEXT: fmov w9, s0
362+
; CHECK-NEXT: mul w0, w9, w8
363+
; CHECK-NEXT: ret
391364
entry:
392365
%arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
393366
ret i32 %arg1
@@ -424,10 +397,9 @@ define i32 @mulv_v4i32(<4 x i32> %a) {
424397
; CHECK-GI: // %bb.0: // %entry
425398
; CHECK-GI-NEXT: mov d1, v0.d[1]
426399
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
427-
; CHECK-GI-NEXT: mov s1, v0.s[1]
428-
; CHECK-GI-NEXT: fmov w8, s0
429-
; CHECK-GI-NEXT: fmov w9, s1
430-
; CHECK-GI-NEXT: mul w0, w8, w9
400+
; CHECK-GI-NEXT: mov w8, v0.s[1]
401+
; CHECK-GI-NEXT: fmov w9, s0
402+
; CHECK-GI-NEXT: mul w0, w9, w8
431403
; CHECK-GI-NEXT: ret
432404
entry:
433405
%arg1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
@@ -452,31 +424,22 @@ define i32 @mulv_v8i32(<8 x i32> %a) {
452424
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
453425
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
454426
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
455-
; CHECK-GI-NEXT: mov s1, v0.s[1]
456-
; CHECK-GI-NEXT: fmov w8, s0
457-
; CHECK-GI-NEXT: fmov w9, s1
458-
; CHECK-GI-NEXT: mul w0, w8, w9
427+
; CHECK-GI-NEXT: mov w8, v0.s[1]
428+
; CHECK-GI-NEXT: fmov w9, s0
429+
; CHECK-GI-NEXT: mul w0, w9, w8
459430
; CHECK-GI-NEXT: ret
460431
entry:
461432
%arg1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
462433
ret i32 %arg1
463434
}
464435

465436
define i64 @mulv_v2i64(<2 x i64> %a) {
466-
; CHECK-SD-LABEL: mulv_v2i64:
467-
; CHECK-SD: // %bb.0: // %entry
468-
; CHECK-SD-NEXT: mov x8, v0.d[1]
469-
; CHECK-SD-NEXT: fmov x9, d0
470-
; CHECK-SD-NEXT: mul x0, x9, x8
471-
; CHECK-SD-NEXT: ret
472-
;
473-
; CHECK-GI-LABEL: mulv_v2i64:
474-
; CHECK-GI: // %bb.0: // %entry
475-
; CHECK-GI-NEXT: mov d1, v0.d[1]
476-
; CHECK-GI-NEXT: fmov x8, d0
477-
; CHECK-GI-NEXT: fmov x9, d1
478-
; CHECK-GI-NEXT: mul x0, x8, x9
479-
; CHECK-GI-NEXT: ret
437+
; CHECK-LABEL: mulv_v2i64:
438+
; CHECK: // %bb.0: // %entry
439+
; CHECK-NEXT: mov x8, v0.d[1]
440+
; CHECK-NEXT: fmov x9, d0
441+
; CHECK-NEXT: mul x0, x9, x8
442+
; CHECK-NEXT: ret
480443
entry:
481444
%arg1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a)
482445
ret i64 %arg1
@@ -522,14 +485,12 @@ define i64 @mulv_v4i64(<4 x i64> %a) {
522485
;
523486
; CHECK-GI-LABEL: mulv_v4i64:
524487
; CHECK-GI: // %bb.0: // %entry
525-
; CHECK-GI-NEXT: mov d2, v0.d[1]
526-
; CHECK-GI-NEXT: mov d3, v1.d[1]
527-
; CHECK-GI-NEXT: fmov x8, d0
528-
; CHECK-GI-NEXT: fmov x9, d2
529-
; CHECK-GI-NEXT: fmov x10, d3
530-
; CHECK-GI-NEXT: mul x8, x8, x9
531-
; CHECK-GI-NEXT: fmov x9, d1
532-
; CHECK-GI-NEXT: mul x9, x9, x10
488+
; CHECK-GI-NEXT: mov x8, v0.d[1]
489+
; CHECK-GI-NEXT: fmov x10, d0
490+
; CHECK-GI-NEXT: mov x9, v1.d[1]
491+
; CHECK-GI-NEXT: mul x8, x10, x8
492+
; CHECK-GI-NEXT: fmov x10, d1
493+
; CHECK-GI-NEXT: mul x9, x10, x9
533494
; CHECK-GI-NEXT: mul x0, x8, x9
534495
; CHECK-GI-NEXT: ret
535496
entry:

0 commit comments

Comments
 (0)