Skip to content

Commit 57216f7

Browse files
authored
[AMDGPU] Support byte_sel modifier for v_cvt_f32_fp8 and v_cvt_f32_bf8 (#90887)
1 parent d4a2597 commit 57216f7

14 files changed

+214
-89
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8634,8 +8634,8 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
86348634
}
86358635

86368636
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
8637-
assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
8638-
Inst.addOperand(Inst.getOperand(0));
8637+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
8638+
Inst.addOperand(Inst.getOperand(0));
86398639
addOptionalImmOperand(Inst, Operands, OptionalIdx,
86408640
AMDGPUOperand::ImmTyByteSel);
86418641
}

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,8 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
140140
if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
141141
!hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
142142
!hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
143-
!hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
143+
!hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
144+
!hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
144145
LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
145146
return false;
146147
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4459,7 +4459,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
44594459

44604460
// Check output modifiers
44614461
return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4462-
!hasModifiersSet(MI, AMDGPU::OpName::clamp);
4462+
!hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4463+
!hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
44634464
}
44644465

44654466
// Set VCC operand with all flags from \p Orig, except for setting it as

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2306,8 +2306,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
23062306
field bit IsWMMA = 0;
23072307
field bit IsSWMMAC = 0;
23082308

2309-
field bit IsFP8 = 0;
2309+
field bit IsFP8SrcByteSel = 0;
23102310
field bit IsFP8DstByteSel = 0;
2311+
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
23112312

23122313
field bit HasDst = !ne(DstVT.Value, untyped.Value);
23132314
field bit HasDst32 = HasDst;
@@ -2427,7 +2428,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
24272428
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
24282429
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
24292430
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
2430-
HasModifiers, DstVT, IsFP8DstByteSel>.ret;
2431+
HasModifiers, DstVT, IsFP8ByteSel>.ret;
24312432
field string Asm64 = AsmVOP3Base;
24322433
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
24332434
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 26 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -625,42 +625,44 @@ def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F <v2f32, i32> {
625625
let HasExtVOP3DPP = 0;
626626
}
627627

628-
def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
629-
let HasOpSel = 1;
628+
class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
629+
let IsFP8SrcByteSel = 1;
630+
let HasOpSel = 0;
630631
let HasExtDPP = 1;
631632
let HasExtVOP3DPP = 1;
632-
let IsFP8 = 1;
633+
let HasExtSDWA = 0;
633634
let HasClamp = 0;
634635
let HasOMod = 0;
635-
let HasModifiers = 1;
636-
let Src1VOP3DPP = Src1RC64;
636+
let HasModifiers = 0;
637+
638+
defvar bytesel = (ins ByteSel:$byte_sel);
639+
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
640+
HasClamp, HasModifiers, HasSrc2Mods,
641+
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
642+
bytesel);
643+
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, Src2VOP3DPP,
644+
NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods,
645+
HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
646+
Src2ModVOP3DPP, HasOpSel>.ret,
647+
bytesel);
637648
}
638649

639650
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
640651
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
641-
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
642-
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
652+
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
653+
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
643654
defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
644655
defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
645656
}
646657

647-
class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
648-
VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
649-
(f32 (node i32:$src, index)),
650-
!if (index,
651-
(inst_e64 !or(!if(index{0}, SRCMODS.OP_SEL_1, 0),
652-
!if(index{1}, SRCMODS.OP_SEL_0, 0)),
653-
$src, 0),
654-
(inst_e32 $src))
658+
class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
659+
(node i32:$src0, timm:$byte_sel),
660+
(inst $src0, (as_i32timm $byte_sel))
655661
>;
656662

657663
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
658-
foreach Index = [0, 1, 2, 3] in {
659-
def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_fp8, Index,
660-
V_CVT_F32_FP8_e32, V_CVT_F32_FP8_OP_SEL_e64>;
661-
def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_bf8, Index,
662-
V_CVT_F32_BF8_e32, V_CVT_F32_BF8_OP_SEL_e64>;
663-
}
664+
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
665+
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
664666
}
665667

666668
class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
@@ -901,14 +903,11 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
901903
VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
902904

903905

906+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
907+
defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
908+
904909
// Define VOP1 instructions using the pseudo instruction with its old profile and
905910
// VOP3 using the OpSel profile for the pseudo instruction.
906-
defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">;
907-
defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
908-
909-
defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">;
910-
defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
911-
912911
defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
913912
defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;
914913

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 19 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -306,9 +306,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
306306

307307
class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
308308

309-
class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
310-
let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
311-
let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
309+
class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
310+
bits<2> byte_sel;
311+
let Inst{11-12} = byte_sel; // NB: bit order is intentionally reversed!
312+
let Inst{14-13} = 0; // op_sel2/3
312313
}
313314

314315
class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
@@ -755,10 +756,14 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
755756
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
756757
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
757758
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
758-
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
759-
let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
760-
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
761-
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
759+
let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
760+
!if(P.IsFP8SrcByteSel, byte_sel{1}, ?));
761+
let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
762+
!if(P.IsFP8SrcByteSel, byte_sel{0}, ?));
763+
let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0),
764+
!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
765+
let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0),
766+
!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
762767
let Inst{15} = !if(P.HasClamp, clamp, 0);
763768
let Inst{25-16} = op;
764769
let Inst{31-26} = 0x35;
@@ -1397,7 +1402,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
13971402
bit isSingle = 0> {
13981403
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
13991404
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
1400-
if ps.Pfl.IsFP8DstByteSel then {
1405+
if ps.Pfl.IsFP8SrcByteSel then {
1406+
def _e64#Gen.Suffix :
1407+
VOP3_Real_Gen<ps, Gen>,
1408+
VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
1409+
} else if ps.Pfl.IsFP8DstByteSel then {
14011410
def _e64#Gen.Suffix :
14021411
VOP3_Real_Gen<ps, Gen>,
14031412
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
@@ -1428,10 +1437,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
14281437
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
14291438
let AsmString = asmName # ps.AsmOperands,
14301439
IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
1431-
if ps.Pfl.IsFP8 then {
1440+
if ps.Pfl.IsFP8SrcByteSel then {
14321441
def _e64#Gen.Suffix :
14331442
VOP3_Real_Gen<ps, Gen>,
1434-
VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
1443+
VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
14351444
} else if ps.Pfl.IsFP8DstByteSel then {
14361445
def _e64#Gen.Suffix :
14371446
VOP3_Real_Gen<ps, Gen>,

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
1414
define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
1515
; GFX12-LABEL: test_cvt_f32_bf8_byte1:
1616
; GFX12: ; %bb.0:
17-
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
18-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
19-
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
17+
; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
2018
; GFX12-NEXT: ; return to shader part epilog
2119
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
2220
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1)
@@ -26,9 +24,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
2624
define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
2725
; GFX12-LABEL: test_cvt_f32_bf8_byte2:
2826
; GFX12: ; %bb.0:
29-
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
30-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
31-
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
27+
; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
3228
; GFX12-NEXT: ; return to shader part epilog
3329
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
3430
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2)
@@ -38,9 +34,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
3834
define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
3935
; GFX12-LABEL: test_cvt_f32_fp8_byte3:
4036
; GFX12: ; %bb.0:
41-
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
42-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
43-
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
37+
; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
4438
; GFX12-NEXT: ; return to shader part epilog
4539
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
4640
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -13,12 +13,12 @@ body: |
1313
; GFX12-NEXT: {{ $}}
1414
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
1515
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
16-
; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
17-
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]]
16+
; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
17+
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_dpp]]
1818
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
1919
%0:vgpr_32 = COPY $vgpr0
2020
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
21-
%2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec
21+
%2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e32 killed %1, implicit $mode, implicit $exec
2222
$vgpr0 = COPY %2
2323
SI_RETURN_TO_EPILOG $vgpr0
2424
@@ -34,13 +34,13 @@ body: |
3434
; GFX12: liveins: $vgpr0
3535
; GFX12-NEXT: {{ $}}
3636
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
37-
; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
38-
; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
39-
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]]
37+
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
38+
; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 2, 228, 15, 15, 1, implicit $mode, implicit $exec
39+
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]]
4040
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
4141
%0:vgpr_32 = COPY $vgpr0
4242
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
43-
%2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec
43+
%2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 killed %1, 2, implicit $mode, implicit $exec
4444
$vgpr0 = COPY %2
4545
SI_RETURN_TO_EPILOG $vgpr0
4646
@@ -56,13 +56,13 @@ body: |
5656
; GFX12: liveins: $vgpr0
5757
; GFX12-NEXT: {{ $}}
5858
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
59-
; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
60-
; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
61-
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]]
59+
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
60+
; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 3, 228, 15, 15, 1, implicit $mode, implicit $exec
61+
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]]
6262
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
6363
%0:vgpr_32 = COPY $vgpr0
6464
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
65-
%2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec
65+
%2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 killed %1, 3, implicit $mode, implicit $exec
6666
$vgpr0 = COPY %2
6767
SI_RETURN_TO_EPILOG $vgpr0
6868

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
4747
; GFX12-NEXT: s_wait_samplecnt 0x0
4848
; GFX12-NEXT: s_wait_bvhcnt 0x0
4949
; GFX12-NEXT: s_wait_kmcnt 0x0
50-
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
50+
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
5151
; GFX12-NEXT: s_setpc_b64 s[30:31]
5252
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
5353
ret float %ret
@@ -67,7 +67,7 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
6767
; GFX12-NEXT: s_wait_samplecnt 0x0
6868
; GFX12-NEXT: s_wait_bvhcnt 0x0
6969
; GFX12-NEXT: s_wait_kmcnt 0x0
70-
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
70+
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
7171
; GFX12-NEXT: s_setpc_b64 s[30:31]
7272
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
7373
ret float %ret
@@ -87,7 +87,7 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
8787
; GFX12-NEXT: s_wait_samplecnt 0x0
8888
; GFX12-NEXT: s_wait_bvhcnt 0x0
8989
; GFX12-NEXT: s_wait_kmcnt 0x0
90-
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1]
90+
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
9191
; GFX12-NEXT: s_setpc_b64 s[30:31]
9292
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
9393
ret float %ret
@@ -127,7 +127,7 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
127127
; GFX12-NEXT: s_wait_samplecnt 0x0
128128
; GFX12-NEXT: s_wait_bvhcnt 0x0
129129
; GFX12-NEXT: s_wait_kmcnt 0x0
130-
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
130+
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
131131
; GFX12-NEXT: s_setpc_b64 s[30:31]
132132
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
133133
ret float %ret
@@ -147,7 +147,7 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
147147
; GFX12-NEXT: s_wait_samplecnt 0x0
148148
; GFX12-NEXT: s_wait_bvhcnt 0x0
149149
; GFX12-NEXT: s_wait_kmcnt 0x0
150-
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0]
150+
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
151151
; GFX12-NEXT: s_setpc_b64 s[30:31]
152152
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
153153
ret float %ret
@@ -167,7 +167,7 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) {
167167
; GFX12-NEXT: s_wait_samplecnt 0x0
168168
; GFX12-NEXT: s_wait_bvhcnt 0x0
169169
; GFX12-NEXT: s_wait_kmcnt 0x0
170-
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
170+
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
171171
; GFX12-NEXT: s_setpc_b64 s[30:31]
172172
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
173173
ret float %ret
@@ -552,7 +552,7 @@ define float @test_sext_cvt_f32_fp8(i16 %a) {
552552
; GFX12-NEXT: s_wait_kmcnt 0x0
553553
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
554554
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
555-
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
555+
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
556556
; GFX12-NEXT: s_setpc_b64 s[30:31]
557557
%a.sext = sext i16 %a to i32
558558
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
@@ -576,7 +576,7 @@ define float @test_sext_cvt_f32_bf8(i16 %a) {
576576
; GFX12-NEXT: s_wait_kmcnt 0x0
577577
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
578578
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
579-
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
579+
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
580580
; GFX12-NEXT: s_setpc_b64 s[30:31]
581581
%a.sext = sext i16 %a to i32
582582
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)

0 commit comments

Comments
 (0)