From 312054a25723f139bfa7ab58c94958e14debd580 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara@apple.com>
Date: Tue, 24 Aug 2021 13:56:01 -0700
Subject: [PATCH] [AArch64][GlobalISel] Fix incorrect handling of fp truncating
 stores.

When the tablegen patterns fail to select a truncating scalar FPR store,
our manual selection code also failed to handle it silently, trying to
generate an invalid copy. Fix this by adding support in the manual code
to generate a proper subreg copy before selecting a non-truncating store.

(cherry-picked from 04fb9b729a53a5f513328159590d86d28336a6da)
---
 .../GISel/AArch64InstructionSelector.cpp      |  42 +++++--
 .../select-store-truncating-float.mir         | 116 ++++++++++++++++++
 .../select-with-no-legality-check.mir         |   8 +-
 3 files changed, 151 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a17bd1b14f412..7980ddb26491b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -22,6 +22,7 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -2701,8 +2702,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_ZEXTLOAD:
   case TargetOpcode::G_LOAD:
   case TargetOpcode::G_STORE: {
+    GLoadStore &LdSt = cast<GLoadStore>(I);
     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
-    LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
+    LLT PtrTy = MRI.getType(LdSt.getPointerReg());
 
     if (PtrTy != LLT::pointer(0, 64)) {
       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
@@ -2710,20 +2712,19 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }
 
-    auto &MemOp = **I.memoperands_begin();
-    uint64_t MemSizeInBytes = MemOp.getSize();
-    unsigned MemSizeInBits = MemSizeInBytes * 8;
-    AtomicOrdering Order = MemOp.getSuccessOrdering();
+    uint64_t MemSizeInBytes = LdSt.getMemSize();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits();
+    AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
     if (Order != AtomicOrdering::NotAtomic &&
         Order != AtomicOrdering::Unordered &&
         Order != AtomicOrdering::Monotonic) {
-      assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
+      assert(!isa<GZExtLoad>(LdSt));
       if (MemSizeInBytes > 64)
         return false;
 
-      if (I.getOpcode() == TargetOpcode::G_LOAD) {
+      if (isa<GLoad>(LdSt)) {
         static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
                                      AArch64::LDARW, AArch64::LDARX};
         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
@@ -2737,7 +2738,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     }
 
 #ifndef NDEBUG
-    const Register PtrReg = I.getOperand(1).getReg();
+    const Register PtrReg = LdSt.getPointerReg();
     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
     // Sanity-check the pointer register.
     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
@@ -2746,13 +2747,31 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
            "Load/Store pointer operand isn't a pointer");
 #endif
 
-    const Register ValReg = I.getOperand(0).getReg();
+    const Register ValReg = LdSt.getReg(0);
+    const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
+    // The code below doesn't support truncating stores, so we need to split it
+    // again.
+    if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+      unsigned SubReg;
+      LLT MemTy = LdSt.getMMO().getMemoryType();
+      auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+      if (!getSubRegForClass(RC, TRI, SubReg))
+        return false;
+
+      // Generate a subreg copy.
+      auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
+                      .addReg(ValReg, 0, SubReg)
+                      .getReg(0);
+      RBI.constrainGenericRegister(Copy, *RC, MRI);
+      LdSt.getOperand(0).setReg(Copy);
+    }
+
     // Helper lambda for partially selecting I. Either returns the original
     // instruction with an updated opcode, or a new instruction.
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
-      bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
+      bool IsStore = isa<GStore>(I);
       const unsigned NewOpc =
           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
       if (NewOpc == I.getOpcode())
@@ -2769,7 +2788,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
 
       // Folded something. Create a new instruction and return it.
       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
-      IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
+      Register CurValReg = I.getOperand(0).getReg();
+      IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
       NewInst.cloneMemRefs(I);
       for (auto &Fn : *AddrModeFns)
         Fn(NewInst);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir
new file mode 100644
index 0000000000000..3e06016f4af53
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir
@@ -0,0 +1,116 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @truncating_f32(double %x) {
+    %alloca = alloca i32, align 4
+    %bitcast = bitcast double %x to i64
+    %trunc = trunc i64 %bitcast to i32
+    store i32 %trunc, i32* %alloca, align 4
+    ret void
+  }
+
+  define void @truncating_f16(double %x) {
+    %alloca = alloca i16, align 2
+    %bitcast = bitcast double %x to i64
+    %trunc = trunc i64 %bitcast to i16
+    store i16 %trunc, i16* %alloca, align 2
+    ret void
+  }
+
+  define void @truncating_f8(double %x) {
+    %alloca = alloca i8, align 1
+    %bitcast = bitcast double %x to i64
+    %trunc = trunc i64 %bitcast to i8
+    store i8 %trunc, i8* %alloca, align 1
+    ret void
+  }
+
+...
+---
+name:            truncating_f32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, name: alloca, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+
+    ; CHECK-LABEL: name: truncating_f32
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY [[COPY]].ssub
+    ; CHECK: STRSui [[COPY1]], %stack.0.alloca, 0 :: (store (s32) into %ir.alloca)
+    ; CHECK: RET_ReallyLR
+    %0:fpr(s64) = COPY $d0
+    %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
+    G_STORE %0(s64), %1(p0) :: (store (s32) into %ir.alloca)
+    RET_ReallyLR
+
+...
+---
+name:            truncating_f16
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+frameInfo:
+  maxAlignment:    2
+stack:
+  - { id: 0, name: alloca, size: 2, alignment: 2 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+
+    ; CHECK-LABEL: name: truncating_f16
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr16 = COPY [[COPY]].hsub
+    ; CHECK: STRHui [[COPY1]], %stack.0.alloca, 0 :: (store (s16) into %ir.alloca)
+    ; CHECK: RET_ReallyLR
+    %0:fpr(s64) = COPY $d0
+    %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
+    G_STORE %0(s64), %1(p0) :: (store (s16) into %ir.alloca)
+    RET_ReallyLR
+
+...
+---
+name:            truncating_f8
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+frameInfo:
+  maxAlignment:    1
+stack:
+  - { id: 0, name: alloca, size: 1, alignment: 1 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+
+    ; CHECK-LABEL: name: truncating_f8
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr16 = COPY [[COPY]].hsub
+    ; CHECK: [[COPY2:%[0-9]+]]:fpr8 = COPY [[COPY1]]
+    ; CHECK: STRBui [[COPY2]], %stack.0.alloca, 0 :: (store (s8) into %ir.alloca)
+    ; CHECK: RET_ReallyLR
+    %0:fpr(s64) = COPY $d0
+    %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
+    G_STORE %0(s64), %1(p0) :: (store (s8) into %ir.alloca)
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
index e0983dbfdd14c..9d044d18b8c24 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
@@ -278,13 +278,13 @@ body:             |
     ; CHECK-LABEL: name: test_rule96_id2146_at_idx8070
     ; CHECK: liveins: $x0
     ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
-    ; CHECK: [[LDRBui:%[0-9]+]]:fpr8 = LDRBui [[COPY]], 0 :: (load (s1))
+    ; CHECK: [[LDRBui:%[0-9]+]]:fpr8 = LDRBui [[COPY]], 0 :: (load (s8))
     ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[LDRBui]]
-    ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 0, 0
+    ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 0, 7
     ; CHECK: $noreg = PATCHABLE_RET [[UBFMWri]]
     %2:gpr(p0) = COPY $x0
-    %0:fpr(s1) = G_LOAD %2(p0) :: (load (s1))
-    %1:gpr(s32) = G_ZEXT %0(s1)
+    %0:fpr(s8) = G_LOAD %2(p0) :: (load (s8))
+    %1:gpr(s32) = G_ZEXT %0(s8)
     $noreg = PATCHABLE_RET %1(s32)
 
 ...