[ARM] Add early-clobber to MVE VCMLA.f32 (#114995)

ostannard · web-flow · commit 9b016e3cb285 · 2024-11-06T14:46:08.000Z
This instruction (but not the f16 variant) cannot us the same register
for the output as either of the inputs, so it needs to be marked as
early-clobber.
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3583,10 +3583,10 @@ def ARMimmOneH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2620))))>; // 1.0 ha
 defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32, ARMimmOneF>;
 defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16, ARMimmOneH>;
 
-class MVE_VCMLA<string suffix, bits<2> size>
+class MVE_VCMLA<string suffix, bits<2> size, string cstr>
   : MVEFloatArithNeon<"vcmla", suffix, size{1}, (outs MQPR:$Qd),
                          (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
-                         "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", size, []> {
+                         "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src"#cstr, size, []> {
   bits<4> Qd;
   bits<4> Qn;
   bits<2> rot;
@@ -3603,8 +3603,8 @@ class MVE_VCMLA<string suffix, bits<2> size>
   let Inst{4} = 0b0;
 }
 
-multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI> {
-  def "" : MVE_VCMLA<VTI.Suffix, VTI.Size>;
+multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI, string cstr=""> {
+  def "" : MVE_VCMLA<VTI.Suffix, VTI.Size, cstr>;
   defvar Inst = !cast<Instruction>(NAME);
 
   let Predicates = [HasMVEFloat] in {
@@ -3633,7 +3633,7 @@ multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI> {
 }
 
 defm MVE_VCMLAf16 : MVE_VCMLA_m<MVE_v8f16>;
-defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32>;
+defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32, ",@earlyclobber $Qd">;
 
 class MVE_VADDSUBFMA_fp<string iname, string suffix, bits<2> size, bit bit_4,
                         bit bit_8, bit bit_21, dag iops=(ins),
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmla.ll b/llvm/test/CodeGen/Thumb2/mve-vcmla.ll
@@ -121,3 +121,27 @@ entry:
   %res = fadd <4 x float> %d, %a
   ret <4 x float> %res
 }
+
+define arm_aapcs_vfpcc <8 x half> @same_register_f16(<8 x half> %a) {
+; CHECK-LABEL: same_register_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmla.f16 q0, q0, q0, #0
+; CHECK-NEXT:    bx lr
+entry:
+  %d = tail call <8 x half> @llvm.arm.mve.vcmlaq.v8f16(i32 0, <8 x half> zeroinitializer, <8 x half> %a, <8 x half> %a)
+  %res = fadd fast <8 x half> %d, %a
+  ret <8 x half> %res
+}
+
+define arm_aapcs_vfpcc <4 x float> @same_register_f32(<4 x float> %a) {
+; CHECK-LABEL: same_register_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vcmla.f32 q1, q0, q0, #0
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %d = tail call <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 0, <4 x float> zeroinitializer, <4 x float> %a, <4 x float> %a)
+  %res = fadd fast <4 x float> %d, %a
+  ret <4 x float> %res
+}