Skip to content

[AArch64][GlobalISel] Expand 64bit extracts to 128bit to allow more patterns #142904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,26 @@ void AArch64RegisterBankInfo::applyMappingImpl(
MI.getOperand(1).setReg(ConstReg);
return applyDefaultMapping(OpdMapper);
}
case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
// Widen the 64-bit source vector of this extract to 128 bits, then fall back
// to the default mapping for the (rewritten) instruction.
//
// SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
// number of duplicate lane-extract patterns needed. Do the same here so
// that selection will operate on the larger vectors.
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
// Only 64-bit sources are expected to reach this case.
assert(SrcTy.getSizeInBits() == 64 && "Expected 64-bit source vector");
// Same element type, twice the element count: the 128-bit widened type.
LLT DstTy = SrcTy.multiplyElements(2);
// New instructions are emitted immediately before MI.
Builder.setInsertPt(*MI.getParent(), MI.getIterator());
// Widened vector = [Src, undef]: the original lanes keep their indices and
// the added upper lanes are left undefined.
auto Undef = Builder.buildUndef(SrcTy);
auto Concat = Builder.buildConcatVectors(DstTy, {Src, Undef.getReg(0)});
// The freshly created registers are explicitly placed on the FPR bank.
MRI.setRegBank(Undef.getReg(0), getRegBank(AArch64::FPRRegBankID));
MRI.setRegBank(Concat.getReg(0), getRegBank(AArch64::FPRRegBankID));
// Redirect every non-debug G_EXTRACT_VECTOR_ELT user of Src (MI included, and
// any other extract of the same source) to read from the widened vector.
// Early-inc iteration is used because changing the operand's register
// modifies Src's use list while it is being walked.
// NOTE(review): the concat is inserted just before MI, but the rewritten
// extracts may live elsewhere; confirm the concat dominates all of them.
for (MachineInstr &Ext :
make_early_inc_range(MRI.use_nodbg_instructions(Src))) {
if (Ext.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT)
Ext.getOperand(1).setReg(Concat.getReg(0));
}
return applyDefaultMapping(OpdMapper);
}
default:
llvm_unreachable("Don't know how to handle that operation");
}
Expand Down Expand Up @@ -1014,14 +1034,20 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
// Operand 0 is the extracted element, operand 1 the source vector and
// operand 2 the lane index.
// Destination and source need to be FPRs.
OpRegBankIdx[0] = PMI_FirstFPR;
OpRegBankIdx[1] = PMI_FirstFPR;

// Index needs to be a GPR.
// Index needs to be a GPR constant.
OpRegBankIdx[2] = PMI_FirstGPR;
// SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
// number of duplicate lane-extract patterns needed. Do the same here so
// that selection will operate on the larger vectors.
// This case only flags the instruction: a fixed-width (non-scalable) source
// of exactly 64 bits gets the custom mapping ID instead of the default one.
LLT Ty = MRI.getType(MI.getOperand(1).getReg());
if (!Ty.isScalable() && Ty.getSizeInBits() == 64)
MappingID = CustomMappingID;
break;
}
case TargetOpcode::G_INSERT_VECTOR_ELT:
OpRegBankIdx[0] = PMI_FirstFPR;
OpRegBankIdx[1] = PMI_FirstFPR;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(<4 x s16>) = COPY $d0
; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[C]](s64)
; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr(<4 x s16>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:fpr(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>)
; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<8 x s16>), [[C]](s64)
; CHECK-NEXT: $h0 = COPY [[EVEC]](s16)
; CHECK-NEXT: RET_ReallyLR implicit $h0
%0:_(<4 x s16>) = COPY $d0
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
;
; CHECK-GI-LABEL: test_bitf_v1i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s0
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
;
; CHECK-GI-LABEL: test_bit_v1i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s0
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AArch64/abs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
;
; CHECK-GI-LABEL: abs_v1i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: cmp w8, #0
Expand Down
13 changes: 8 additions & 5 deletions llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1215,6 +1215,7 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: testDUP.v1i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: ret
Expand Down Expand Up @@ -1710,7 +1711,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI127_0
; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
; CHECK-GI-NEXT: mov b1, v0.b[0]
; CHECK-GI-NEXT: mov v1.b[1], v0.b[1]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[2]
; CHECK-GI-NEXT: mov v1.b[3], v0.b[3]
Expand Down Expand Up @@ -1817,7 +1818,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: mov b2, v0.b[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.b[1], v0.b[1]
; CHECK-GI-NEXT: mov v2.b[2], v0.b[2]
Expand Down Expand Up @@ -1903,7 +1904,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI131_0
; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NEXT: mov h1, v0.h[0]
; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
Expand Down Expand Up @@ -1974,7 +1975,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NEXT: mov h2, v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
Expand Down Expand Up @@ -2036,7 +2037,7 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI135_0
; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov s1, v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0]
; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
Expand Down Expand Up @@ -2242,6 +2243,7 @@ define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: concat_vector_v8i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: ret
Expand All @@ -2268,6 +2270,7 @@ define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: concat_vector_v16i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.16b, w8
; CHECK-GI-NEXT: ret
Expand Down
45 changes: 15 additions & 30 deletions llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
Original file line number Diff line number Diff line change
Expand Up @@ -614,16 +614,11 @@ entry:
}

define void @test_vst1_lane0_s16(ptr %a, <4 x i16> %b) {
; CHECK-GI-LABEL: test_vst1_lane0_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: ret
;
; CHECK-SD-LABEL: test_vst1_lane0_s16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: ret
; CHECK-LABEL: test_vst1_lane0_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
entry:
%0 = extractelement <4 x i16> %b, i32 0
store i16 %0, ptr %a, align 2
Expand All @@ -643,16 +638,11 @@ entry:
}

define void @test_vst1_lane0_s32(ptr %a, <2 x i32> %b) {
; CHECK-GI-LABEL: test_vst1_lane0_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: str s0, [x0]
; CHECK-GI-NEXT: ret
;
; CHECK-SD-LABEL: test_vst1_lane0_s32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str s0, [x0]
; CHECK-SD-NEXT: ret
; CHECK-LABEL: test_vst1_lane0_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
entry:
%0 = extractelement <2 x i32> %b, i32 0
store i32 %0, ptr %a, align 4
Expand Down Expand Up @@ -683,16 +673,11 @@ entry:
}

define void @test_vst1_lane0_f32(ptr %a, <2 x float> %b) {
; CHECK-GI-LABEL: test_vst1_lane0_f32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: str s0, [x0]
; CHECK-GI-NEXT: ret
;
; CHECK-SD-LABEL: test_vst1_lane0_f32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str s0, [x0]
; CHECK-SD-NEXT: ret
; CHECK-LABEL: test_vst1_lane0_f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
entry:
%0 = extractelement <2 x float> %b, i32 0
store float %0, ptr %a, align 4
Expand Down
55 changes: 19 additions & 36 deletions llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -663,24 +663,14 @@ entry:
}

define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
; CHECK-SD-LABEL: test_vqrdmlahs_lane_s32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: sqrdmlah s1, s2, v0.s[1]
; CHECK-SD-NEXT: fmov w0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlahs_lane_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov s1, w0
; CHECK-GI-NEXT: fmov s2, w1
; CHECK-GI-NEXT: mov s0, v0.s[1]
; CHECK-GI-NEXT: sqrdmlah s1, s2, s0
; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlahs_lane_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqrdmlah s1, s2, v0.s[1]
; CHECK-NEXT: fmov w0, s1
; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <2 x i32> %c, i64 1
%vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
Expand Down Expand Up @@ -813,24 +803,14 @@ entry:
}

define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
; CHECK-SD-LABEL: test_vqrdmlshs_lane_s32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: sqrdmlsh s1, s2, v0.s[1]
; CHECK-SD-NEXT: fmov w0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlshs_lane_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov s1, w0
; CHECK-GI-NEXT: fmov s2, w1
; CHECK-GI-NEXT: mov s0, v0.s[1]
; CHECK-GI-NEXT: sqrdmlsh s1, s2, s0
; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlshs_lane_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqrdmlsh s1, s2, v0.s[1]
; CHECK-NEXT: fmov w0, s1
; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <2 x i32> %c, i64 1
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
Expand Down Expand Up @@ -867,3 +847,6 @@ entry:
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
ret i32 %vqrdmlshs_s32.i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-GI: {{.*}}
; CHECK-SD: {{.*}}
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ define half @test_vcvt_f16_f32(<1 x float> %x) {
;
; GISEL-LABEL: test_vcvt_f16_f32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
; GISEL-NEXT: fcvt h0, s0
; GISEL-NEXT: ret
%tmp = fptrunc <1 x float> %x to <1 x half>
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AArch64/bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
;
; CHECK-GI-LABEL: bswap_v1i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: rev w8, w8
; CHECK-GI-NEXT: fmov s0, w8
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AArch64/concat-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v0.h[3], w9
; CHECK-GI-NEXT: mov w8, v1.s[1]
; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
; CHECK-GI-NEXT: mov v0.h[3], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
Expand Down
18 changes: 6 additions & 12 deletions llvm/test/CodeGen/AArch64/double_reduct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,8 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
Expand All @@ -92,10 +90,8 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
Expand Down Expand Up @@ -922,10 +918,8 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
; CHECK-GI-NEXT: mov d5, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s
; CHECK-GI-NEXT: mov s4, v0.s[1]
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s4
; CHECK-GI-NEXT: fmul s1, s1, s5
; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
Expand Down
Loading
Loading