Skip to content

Commit 3471520

Browse files
committed
[ARM] Allow tail predication of VLDn
VLD2/4 instructions cannot be predicated, so we cannot tail predicate them from autovec. From intrinsics though, they should be valid, as they will just end up loading extra values into the off vector lanes without affecting the on lanes. The same is true for loads in general: so long as we are not using the other vector lanes, an unpredicated load can be converted to a predicated one. This marks VLD2 and VLD4 instructions as validForTailPredication and allows any unpredicated load in a tail-predicated loop, which seems to be valid given the other checks we have. Differential Revision: https://reviews.llvm.org/D86022
1 parent 7baed76 commit 3471520

File tree

4 files changed

+64
-45
lines changed

4 files changed

+64
-45
lines changed

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5829,6 +5829,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
58295829
let mayLoad = load;
58305830
let mayStore = !eq(load,0);
58315831
let hasSideEffects = 0;
5832+
let validForTailPredication = load;
58325833
}
58335834

58345835
// A parameter class used to encapsulate all the ways the writeback

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
782782
// the false lanes are zeroed and here we're trying to track that those false
783783
// lanes remain zero, or where they change, the differences are masked away
784784
// by their user(s).
785-
// All MVE loads and stores have to be predicated, so we know that any load
785+
// All MVE stores have to be predicated, so we know that any predicate load
786786
// operands, or stored results are equivalent already. Other explicitly
787787
// predicated instructions will perform the same operation in the original
788788
// loop and the tail-predicated form too. Because of this, we can insert
@@ -1038,8 +1038,8 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
10381038
}
10391039

10401040
// If the instruction is already explicitly predicated, then the conversion
1041-
// will be fine, but ensure that all memory operations are predicated.
1042-
return !IsUse && MI->mayLoadOrStore() ? false : true;
1041+
// will be fine, but ensure that all store operations are predicated.
1042+
return !IsUse && MI->mayStore() ? false : true;
10431043
}
10441044

10451045
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {

llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,17 @@ define void @arm_cmplx_mag_squared_q15_mve(i16* %pSrc, i16* %pDst, i32 %blockSiz
66
; CHECK: @ %bb.0: @ %entry
77
; CHECK-NEXT: push {r7, lr}
88
; CHECK-NEXT: subs.w r12, r2, #8
9-
; CHECK-NEXT: mov.w r3, #-1
10-
; CHECK-NEXT: csinv r3, r3, r12, pl
11-
; CHECK-NEXT: add.w r12, r3, r2
12-
; CHECK-NEXT: movs r3, #1
13-
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
14-
; CHECK-NEXT: dls lr, lr
9+
; CHECK-NEXT: dlstp.16 lr, r2
1510
; CHECK-NEXT: .LBB0_1: @ %do.body
1611
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1712
; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
18-
; CHECK-NEXT: vctp.16 r2
19-
; CHECK-NEXT: subs r2, #8
2013
; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
21-
; CHECK-NEXT: vpstttt
22-
; CHECK-NEXT: vmulht.s16 q2, q1, q1
23-
; CHECK-NEXT: vmulht.s16 q0, q0, q0
24-
; CHECK-NEXT: vqaddt.s16 q0, q0, q2
25-
; CHECK-NEXT: vshrt.s16 q0, q0, #1
26-
; CHECK-NEXT: vpst
27-
; CHECK-NEXT: vstrht.16 q0, [r1], #16
28-
; CHECK-NEXT: le lr, .LBB0_1
14+
; CHECK-NEXT: vmulh.s16 q2, q1, q1
15+
; CHECK-NEXT: vmulh.s16 q0, q0, q0
16+
; CHECK-NEXT: vqadd.s16 q0, q0, q2
17+
; CHECK-NEXT: vshr.s16 q0, q0, #1
18+
; CHECK-NEXT: vstrh.16 q0, [r1], #16
19+
; CHECK-NEXT: letp lr, .LBB0_1
2920
; CHECK-NEXT: @ %bb.2: @ %do.end
3021
; CHECK-NEXT: pop {r7, pc}
3122
entry:
@@ -148,25 +139,14 @@ define i32 @good2(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n
148139
; CHECK-LABEL: good2:
149140
; CHECK: @ %bb.0: @ %entry
150141
; CHECK-NEXT: push {r7, lr}
151-
; CHECK-NEXT: mov r3, r2
152-
; CHECK-NEXT: cmp r2, #4
153-
; CHECK-NEXT: it ge
154-
; CHECK-NEXT: movge r3, #4
155-
; CHECK-NEXT: subs r3, r2, r3
156-
; CHECK-NEXT: add.w r12, r3, #3
157-
; CHECK-NEXT: movs r3, #1
158-
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
159142
; CHECK-NEXT: mov.w r12, #0
160-
; CHECK-NEXT: dls lr, lr
143+
; CHECK-NEXT: dlstp.32 lr, r2
161144
; CHECK-NEXT: .LBB3_1: @ %do.body
162145
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
163-
; CHECK-NEXT: vctp.32 r2
164146
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
165147
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
166-
; CHECK-NEXT: subs r2, #4
167-
; CHECK-NEXT: vpst
168-
; CHECK-NEXT: vmlavat.s32 r12, q1, q0
169-
; CHECK-NEXT: le lr, .LBB3_1
148+
; CHECK-NEXT: vmlava.s32 r12, q1, q0
149+
; CHECK-NEXT: letp lr, .LBB3_1
170150
; CHECK-NEXT: @ %bb.2: @ %do.end
171151
; CHECK-NEXT: mov r0, r12
172152
; CHECK-NEXT: pop {r7, pc}

llvm/unittests/Target/ARM/MachineInstrTest.cpp

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
382382
return false;
383383
case MVE_ASRLi:
384384
case MVE_ASRLr:
385-
case MVE_LSRL:
385+
case MVE_LSRL:
386386
case MVE_SQRSHR:
387387
case MVE_SQSHL:
388388
case MVE_SRSHR:
@@ -393,7 +393,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
393393
case MVE_VABDf32:
394394
case MVE_VABDs16:
395395
case MVE_VABDs32:
396-
case MVE_VABDs8:
396+
case MVE_VABDs8:
397397
case MVE_VABDu16:
398398
case MVE_VABDu32:
399399
case MVE_VABDu8:
@@ -609,6 +609,42 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
609609
case MVE_VIWDUPu16:
610610
case MVE_VIWDUPu32:
611611
case MVE_VIWDUPu8:
612+
case MVE_VLD20_8:
613+
case MVE_VLD21_8:
614+
case MVE_VLD20_16:
615+
case MVE_VLD21_16:
616+
case MVE_VLD20_32:
617+
case MVE_VLD21_32:
618+
case MVE_VLD20_8_wb:
619+
case MVE_VLD21_8_wb:
620+
case MVE_VLD20_16_wb:
621+
case MVE_VLD21_16_wb:
622+
case MVE_VLD20_32_wb:
623+
case MVE_VLD21_32_wb:
624+
case MVE_VLD40_8:
625+
case MVE_VLD41_8:
626+
case MVE_VLD42_8:
627+
case MVE_VLD43_8:
628+
case MVE_VLD40_16:
629+
case MVE_VLD41_16:
630+
case MVE_VLD42_16:
631+
case MVE_VLD43_16:
632+
case MVE_VLD40_32:
633+
case MVE_VLD41_32:
634+
case MVE_VLD42_32:
635+
case MVE_VLD43_32:
636+
case MVE_VLD40_8_wb:
637+
case MVE_VLD41_8_wb:
638+
case MVE_VLD42_8_wb:
639+
case MVE_VLD43_8_wb:
640+
case MVE_VLD40_16_wb:
641+
case MVE_VLD41_16_wb:
642+
case MVE_VLD42_16_wb:
643+
case MVE_VLD43_16_wb:
644+
case MVE_VLD40_32_wb:
645+
case MVE_VLD41_32_wb:
646+
case MVE_VLD42_32_wb:
647+
case MVE_VLD43_32_wb:
612648
case MVE_VLDRBS16:
613649
case MVE_VLDRBS16_post:
614650
case MVE_VLDRBS16_pre:
@@ -657,9 +693,9 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
657693
case MVE_VLDRWU32_rq_u:
658694
case MVE_VMOVimmf32:
659695
case MVE_VMOVimmi16:
660-
case MVE_VMOVimmi32:
696+
case MVE_VMOVimmi32:
661697
case MVE_VMOVimmi64:
662-
case MVE_VMOVimmi8:
698+
case MVE_VMOVimmi8:
663699
case MVE_VMOVNi16bh:
664700
case MVE_VMOVNi16th:
665701
case MVE_VMOVNi32bh:
@@ -679,7 +715,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
679715
case MVE_VMULLTs8:
680716
case MVE_VMULLTu16:
681717
case MVE_VMULLTu32:
682-
case MVE_VMULLTu8:
718+
case MVE_VMULLTu8:
683719
case MVE_VMUL_qr_f16:
684720
case MVE_VMUL_qr_f32:
685721
case MVE_VMUL_qr_i16:
@@ -702,7 +738,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
702738
case MVE_VORR:
703739
case MVE_VORRimmi16:
704740
case MVE_VORRimmi32:
705-
case MVE_VPST:
741+
case MVE_VPST:
706742
case MVE_VQABSs16:
707743
case MVE_VQABSs32:
708744
case MVE_VQABSs8:
@@ -814,7 +850,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
814850
case MVE_VRHADDs32:
815851
case MVE_VRHADDs8:
816852
case MVE_VRHADDu16:
817-
case MVE_VRHADDu32:
853+
case MVE_VRHADDu32:
818854
case MVE_VRHADDu8:
819855
case MVE_VRINTf16A:
820856
case MVE_VRINTf16M:
@@ -825,12 +861,12 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
825861
case MVE_VRINTf32A:
826862
case MVE_VRINTf32M:
827863
case MVE_VRINTf32N:
828-
case MVE_VRINTf32P:
829-
case MVE_VRINTf32X:
864+
case MVE_VRINTf32P:
865+
case MVE_VRINTf32X:
830866
case MVE_VRINTf32Z:
831867
case MVE_VRSHL_by_vecs16:
832868
case MVE_VRSHL_by_vecs32:
833-
case MVE_VRSHL_by_vecs8:
869+
case MVE_VRSHL_by_vecs8:
834870
case MVE_VRSHL_by_vecu16:
835871
case MVE_VRSHL_by_vecu32:
836872
case MVE_VRSHL_by_vecu8:
@@ -887,7 +923,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
887923
case MVE_VSTRB16_rq:
888924
case MVE_VSTRB32:
889925
case MVE_VSTRB32_post:
890-
case MVE_VSTRB32_pre:
926+
case MVE_VSTRB32_pre:
891927
case MVE_VSTRB32_rq:
892928
case MVE_VSTRB8_rq:
893929
case MVE_VSTRBU8:
@@ -957,7 +993,9 @@ TEST(MachineInstrValidTailPredication, IsCorrect) {
957993
for (auto &Op : Desc.operands()) {
958994
// Only check instructions that access the MQPR regs.
959995
if ((Op.OperandType & MCOI::OPERAND_REGISTER) == 0 ||
960-
Op.RegClass != ARM::MQPRRegClassID)
996+
(Op.RegClass != ARM::MQPRRegClassID &&
997+
Op.RegClass != ARM::QQPRRegClassID &&
998+
Op.RegClass != ARM::QQQQPRRegClassID))
961999
continue;
9621000

9631001
uint64_t Flags = MII->get(i).TSFlags;

0 commit comments

Comments
 (0)