This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit 51e92d1

[AArch64] Improve add/sub/cmp isel of uxtw forms.
Don't match the UXTW extended reg forms of ADD/ADDS/SUB/SUBS if the 32-bit
to 64-bit zero-extend can be done for free by taking advantage of the 32-bit
defining instruction zeroing the upper 32-bits of the X register destination.
This enables better instruction selection in a few cases, such as:

    sub x0, xzr, x8
instead of:
    mov x8, xzr
    sub x0, x8, w9, uxtw

    madd x0, x1, x1, x8
instead of:
    mul x9, x1, x1
    add x0, x9, w8, uxtw

    cmp x2, x8
instead of:
    sub x8, x2, w8, uxtw
    cmp x8, #0

    add x0, x8, x1, lsl #3
instead of:
    lsl x9, x1, #3
    add x0, x9, w8, uxtw

Reviewers: t.p.northover, jmolloy

Subscribers: mcrosier, aemerson, llvm-commits, rengolin

Differential Revision: https://reviews.llvm.org/D24747

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@282413 91177308-0d34-0410-b5e6-96231b3b80d8
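For context, the madd case above corresponds to IR of the following shape; this is the @madd_fold_uxtw function added to test/CodeGen/AArch64/addsub_ext.ll by this commit (register numbers in the comments are illustrative). The 32-bit 'and' already zeroes the upper half of its destination X register, so the zext needs no instruction:

define i64 @madd_fold_uxtw(i32 %x, i64 %y) {
entry:
  %m = and i32 %x, 3            ; and w8, w0, #0x3 also zeroes bits 63:32 of x8
  %ext = zext i32 %m to i64     ; free: no instruction needed
  %mul = mul i64 %y, %y
  %ret = add i64 %mul, %ext     ; now selected as: madd x0, x1, x1, x8
  ret i64 %ret
}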
1 parent 653861f commit 51e92d1

4 files changed: +124 −14 lines

lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions

@@ -586,6 +586,11 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
       return false;
 
     Reg = N.getOperand(0);
+
+    // Don't match if free 32-bit -> 64-bit zext can be used instead.
+    if (Ext == AArch64_AM::UXTW &&
+        Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+      return false;
   }
 
   // AArch64 mandates that the RHS of the operation must use the smallest
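For contrast, here is a minimal sketch (a hypothetical function, not part of the patch) where the new guard does not fire: a plain i32 argument reaches instruction selection as a CopyFromReg node, so isDef32 returns false and the extended-register form is still selected, just as in the updated addsub_i32rhs/sub_i32rhs tests below.

define i64 @zext_of_argument(i32 %in32, i64 %lhs64) {
  %rhs64 = zext i32 %in32 to i64    ; argument: upper 32 bits not known to be zero
  %res = add i64 %lhs64, %rhs64     ; expect something like: add x0, x1, w0, uxtw
  ret i64 %res
}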

lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 15 additions & 0 deletions

@@ -215,6 +215,21 @@ enum NodeType : unsigned {
 
 } // end namespace AArch64ISD
 
+namespace {
+
+// Any instruction that defines a 32-bit result zeros out the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+static inline bool isDef32(const SDNode &N) {
+  unsigned Opc = N.getOpcode();
+  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
+         Opc != ISD::CopyFromReg;
+}
+
+} // end anonymous namespace
+
 class AArch64Subtarget;
 class AArch64TargetMachine;
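To illustrate the exclusions listed in that comment, consider a small hypothetical example (not part of the patch): a 32-bit value produced by a truncate is lowered to a sub-register view of a wider register whose upper bits may hold anything, so isDef32 must return false and an explicit extension (uxtw operand or mask) is kept.

define i64 @zext_of_trunc(i64 %x, i64 %y) {
  %t = trunc i64 %x to i32    ; lowered via EXTRACT_SUBREG; bits 63:32 unknown
  %e = zext i32 %t to i64
  %r = add i64 %y, %e         ; still needs an explicit extension
  ret i64 %r
}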

lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 1 addition & 8 deletions

@@ -5272,15 +5272,8 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0
 //----------------------------------------------------------------------------
 // FIXME: Like for X86, these should go in their own separate .td file.
 
-// Any instruction that defines a 32-bit result leaves the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
 def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return N->getOpcode() != ISD::TRUNCATE &&
-         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
-         N->getOpcode() != ISD::CopyFromReg;
+  return isDef32(*N);
 }]>;
 
 // In the case of a 32-bit def that is known to implicitly zero-extend,

test/CodeGen/AArch64/addsub_ext.ll

Lines changed: 103 additions & 6 deletions

@@ -274,19 +274,20 @@ define void @sub_i16rhs() minsize {
 ; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for
 ; example), but the remaining instructions are probably not idiomatic
 ; in the face of "add/sub (shifted register)" so I don't intend to.
-define void @addsub_i32rhs() minsize {
+define void @addsub_i32rhs(i32 %in32) minsize {
 ; CHECK-LABEL: addsub_i32rhs:
   %val32_tmp = load i32, i32* @var32
   %lhs64 = load i64, i64* @var64
 
   %val32 = add i32 %val32_tmp, 123
 
-  %rhs64_zext = zext i32 %val32 to i64
+  %rhs64_zext = zext i32 %in32 to i64
   %res64_zext = add i64 %lhs64, %rhs64_zext
   store volatile i64 %res64_zext, i64* @var64
 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
 
-  %rhs64_zext_shift = shl i64 %rhs64_zext, 2
+  %rhs64_zext2 = zext i32 %val32 to i64
+  %rhs64_zext_shift = shl i64 %rhs64_zext2, 2
   %res64_zext_shift = add i64 %lhs64, %rhs64_zext_shift
   store volatile i64 %res64_zext_shift, i64* @var64
 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2

@@ -304,19 +305,20 @@ define void @addsub_i32rhs() minsize {
   ret void
 }
 
-define void @sub_i32rhs() minsize {
+define void @sub_i32rhs(i32 %in32) minsize {
 ; CHECK-LABEL: sub_i32rhs:
   %val32_tmp = load i32, i32* @var32
   %lhs64 = load i64, i64* @var64
 
   %val32 = add i32 %val32_tmp, 123
 
-  %rhs64_zext = zext i32 %val32 to i64
+  %rhs64_zext = zext i32 %in32 to i64
   %res64_zext = sub i64 %lhs64, %rhs64_zext
   store volatile i64 %res64_zext, i64* @var64
 ; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
 
-  %rhs64_zext_shift = shl i64 %rhs64_zext, 2
+  %rhs64_zext2 = zext i32 %val32 to i64
+  %rhs64_zext_shift = shl i64 %rhs64_zext2, 2
   %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift
   store volatile i64 %res64_zext_shift, i64* @var64
 ; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2

@@ -333,3 +335,98 @@ define void @sub_i32rhs() minsize {
 
   ret void
 }
+
+; Check that implicit zext from w reg write is used instead of uxtw form of add.
+define i64 @add_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: add_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: add x0, x1, x[[TMP]]
+  %ret = add i64 %y, %ext
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of sub and that mov WZR is folded to form a neg instruction.
+define i64 @sub_fold_uxtw_xzr(i32 %x) {
+; CHECK-LABEL: sub_fold_uxtw_xzr:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: neg x0, x[[TMP]]
+  %ret = sub i64 0, %ext
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw form of subs/cmp.
+define i1 @cmp_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: cmp_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: cmp x1, x[[TMP]]
+; CHECK-NEXT: cset
+  %ret = icmp eq i64 %y, %ext
+  ret i1 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of add, leading to madd selection.
+define i64 @madd_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: madd_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: madd x0, x1, x1, x[[TMP]]
+  %mul = mul i64 %y, %y
+  %ret = add i64 %mul, %ext
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of sub, leading to sub/cmp folding.
+; Check that implicit zext from w reg write is used instead of uxtw form of subs/cmp.
+define i1 @cmp_sub_fold_uxtw(i32 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: cmp_sub_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: cmp x[[TMP2:[0-9]+]], x[[TMP]]
+; CHECK-NEXT: cset
+  %sub = sub i64 %z, %ext
+  %ret = icmp eq i64 %sub, 0
+  ret i1 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of add and add of -1 gets selected as sub.
+define i64 @add_imm_fold_uxtw(i32 %x) {
+; CHECK-LABEL: add_imm_fold_uxtw:
+entry:
+; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
+  %m = and i32 %x, 3
+  %ext = zext i32 %m to i64
+; CHECK-NEXT: sub x0, x[[TMP]], #1
+  %ret = add i64 %ext, -1
+  ret i64 %ret
+}
+
+; Check that implicit zext from w reg write is used instead of uxtw
+; form of add and add lsl form gets selected.
+define i64 @add_lsl_fold_uxtw(i32 %x, i64 %y) {
+; CHECK-LABEL: add_lsl_fold_uxtw:
+entry:
+; CHECK: orr w[[TMP:[0-9]+]], w0, #0x3
+  %m = or i32 %x, 3
+  %ext = zext i32 %m to i64
+  %shift = shl i64 %y, 3
+; CHECK-NEXT: add x0, x[[TMP]], x1, lsl #3
+  %ret = add i64 %ext, %shift
+  ret i64 %ret
+}
